From 1aa70324bc96494801533d7ee880424d4169ea10 Mon Sep 17 00:00:00 2001 From: Aleksey Date: Sat, 18 Nov 2017 17:52:22 +0300 Subject: [PATCH 01/30] + added optimization.h and .gitignore updated --- .gitignore | 2 ++ include/meta/stats/optimization.h | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 include/meta/stats/optimization.h diff --git a/.gitignore b/.gitignore index 9167fd029..73e3a6ff6 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,5 @@ data/cranfield biicode.conf bii/ bin/ +*.pro +*.pro.user \ No newline at end of file diff --git a/include/meta/stats/optimization.h b/include/meta/stats/optimization.h new file mode 100644 index 000000000..c623439b6 --- /dev/null +++ b/include/meta/stats/optimization.h @@ -0,0 +1,25 @@ +#ifndef OPTIMIZATION_H +#define OPTIMIZATION_H + +#include "meta/embeddings/word_embeddings.h" + +using namespace meta::embeddings; + +namespace meta +{ +namespace stats +{ +namespace opt +{ + +// first method +double minka_fpi(word_embeddings model){ + for (auto term_it: model.vocab()){ + + } +} + +} +} +} +#endif // OPTIMIZATION_H From 5e3e23c4623805352a357e4caca36b8e64cbe971 Mon Sep 17 00:00:00 2001 From: Aleksey Date: Sun, 19 Nov 2017 13:02:09 +0300 Subject: [PATCH 02/30] [opt] dirichlet_optimizer class, digamma function --- include/meta/stats/optimization.h | 57 +++++++++++++++-- include/meta/stats/statistics.h | 101 ++++++++++++++++++++++++++++++ 2 files changed, 152 insertions(+), 6 deletions(-) diff --git a/include/meta/stats/optimization.h b/include/meta/stats/optimization.h index c623439b6..5fc13748b 100644 --- a/include/meta/stats/optimization.h +++ b/include/meta/stats/optimization.h @@ -2,8 +2,11 @@ #define OPTIMIZATION_H #include "meta/embeddings/word_embeddings.h" +#include "meta/stats/statistics.h" +using namespace meta::stats; using namespace meta::embeddings; +using namespace meta::util; namespace meta { @@ -12,14 +15,56 @@ namespace stats namespace opt { -// first method -double 
minka_fpi(word_embeddings model){ - for (auto term_it: model.vocab()){ +aligned_vector get_docs_voc(aligned_vector docs_models){ + aligned_vector docs_voc(); + for (auto m_iter: docs_models){ + // todo } -} + return docs_voc; } -} -} + +class dirichlet_optimizer{ +public: + dirichlet_optimizer(aligned_vector docs_models, int alpha=1){ + this->docs_models_ = docs_models; + this->default_alpha_ = alpha; + + this->docs_voc_ = get_docs_voc(docs_models); + } + + double minka_fpi(){ + double alpha = default_alpha_; + + double nom, denom, + alpha_m, + alpha_dig, alpha_m_dig, + all_words_count, k_words_count; + + // digamma(x) + for (std::string word: voc){ + // todo + } + } + + double minka_newton(){ + // todo + } + + double minka_lou(){ + // todo + } + +private: + double minka_fpi_iters(); + double minka_newton_iters(); + double minka_lou_iters(); + + aligned_vector docs_models_; + aligned_vector docs_voc_; + + double default_alpha_; +}; + #endif // OPTIMIZATION_H diff --git a/include/meta/stats/statistics.h b/include/meta/stats/statistics.h index d6823fed1..168af1245 100644 --- a/include/meta/stats/statistics.h +++ b/include/meta/stats/statistics.h @@ -19,6 +19,107 @@ namespace meta namespace stats { +#ifndef M_PIl +/** The constant Pi in high precision */ +#define M_PIl 3.1415926535897932384626433832795029L +#endif +#ifndef M_GAMMAl +/** Euler's constant in high precision */ +#define M_GAMMAl 0.5772156649015328606065120900824024L +#endif +#ifndef M_LN2l +/** the natural logarithm of 2 in high precision */ +#define M_LN2l 0.6931471805599453094172321214581766L +#endif + +/** The digamma function in long double precision. +* @param x the real value of the argument +* @return the value of the digamma (psi) function at that point +* @author Richard J. 
Mathar +* @since 2005-11-24 +*/ +long double digamma(long double x) +{ + /* force into the interval 1..3 */ + if( x < 0.0L ) + return digamma(1.0L-x)+M_PIl/tanl(M_PIl*(1.0L-x)) ; /* reflection formula */ + else if( x < 1.0L ) + return digamma(1.0L+x)-1.0L/x ; + else if ( x == 1.0L) + return -M_GAMMAl ; + else if ( x == 2.0L) + return 1.0L-M_GAMMAl ; + else if ( x == 3.0L) + return 1.5L-M_GAMMAl ; + else if ( x > 3.0L) + /* duplication formula */ + return 0.5L*(digamma(x/2.0L)+digamma((x+1.0L)/2.0L))+M_LN2l ; + else + { + /* Just for your information, the following lines contain + * the Maple source code to re-generate the table that is + * eventually becoming the Kncoe[] array below + * interface(prettyprint=0) : + * Digits := 63 : + * r := 0 : + * + * for l from 1 to 60 do + * d := binomial(-1/2,l) : + * r := r+d*(-1)^l*(Zeta(2*l+1) -1) ; + * evalf(r) ; + * print(%,evalf(1+Psi(1)-r)) ; + *o d : + * + * for N from 1 to 28 do + * r := 0 : + * n := N-1 : + * + * for l from iquo(n+3,2) to 70 do + * d := 0 : + * for s from 0 to n+1 do + * d := d+(-1)^s*binomial(n+1,s)*binomial((s-1)/2,l) : + * od : + * if 2*l-n > 1 then + * r := r+d*(-1)^l*(Zeta(2*l-n) -1) : + * fi : + * od : + * print(evalf((-1)^n*2*r)) ; + *od : + *quit : + */ + static long double Kncoe[] = { .30459198558715155634315638246624251L, + .72037977439182833573548891941219706L, -.12454959243861367729528855995001087L, + .27769457331927827002810119567456810e-1L, -.67762371439822456447373550186163070e-2L, + .17238755142247705209823876688592170e-2L, -.44817699064252933515310345718960928e-3L, + .11793660000155572716272710617753373e-3L, -.31253894280980134452125172274246963e-4L, + .83173997012173283398932708991137488e-5L, -.22191427643780045431149221890172210e-5L, + .59302266729329346291029599913617915e-6L, -.15863051191470655433559920279603632e-6L, + .42459203983193603241777510648681429e-7L, -.11369129616951114238848106591780146e-7L, + .304502217295931698401459168423403510e-8L, 
-.81568455080753152802915013641723686e-9L, + .21852324749975455125936715817306383e-9L, -.58546491441689515680751900276454407e-10L, + .15686348450871204869813586459513648e-10L, -.42029496273143231373796179302482033e-11L, + .11261435719264907097227520956710754e-11L, -.30174353636860279765375177200637590e-12L, + .80850955256389526647406571868193768e-13L, -.21663779809421233144009565199997351e-13L, + .58047634271339391495076374966835526e-14L, -.15553767189204733561108869588173845e-14L, + .41676108598040807753707828039353330e-15L, -.11167065064221317094734023242188463e-15L } ; + + register long double Tn_1 = 1.0L ; /* T_{n-1}(x), started at n=1 */ + register long double Tn = x-2.0L ; /* T_{n}(x) , started at n=1 */ + register long double resul = Kncoe[0] + Kncoe[1]*Tn ; + + x -= 2.0L ; + + for(int n = 2 ; n < sizeof(Kncoe)/sizeof(long double) ;n++) + { + const long double Tn1 = 2.0L * x * Tn - Tn_1 ; /* Chebyshev recursion, Eq. 22.7.4 Abramowitz-Stegun */ + resul += Kncoe[n]*Tn1 ; + Tn_1 = Tn ; + Tn = Tn1 ; + } + return resul ; + } +} + /** * Computation for \f$E_d[f(x)]\f$ where \f$d\f$ is specified by the * `dist` parameter and \f$f(x)\f$ is the `fun` parameter. 
From b8dbc7d8ebd66c399587ac7bc6abf4c3f57d610c Mon Sep 17 00:00:00 2001 From: Aleksey Date: Sun, 19 Nov 2017 23:16:17 +0300 Subject: [PATCH 03/30] [opt] minka_fpi method draft --- include/meta/stats/optimization.h | 108 ++++++++++++++++++++++++------ 1 file changed, 89 insertions(+), 19 deletions(-) diff --git a/include/meta/stats/optimization.h b/include/meta/stats/optimization.h index 5fc13748b..cf85b8609 100644 --- a/include/meta/stats/optimization.h +++ b/include/meta/stats/optimization.h @@ -3,10 +3,14 @@ #include "meta/embeddings/word_embeddings.h" #include "meta/stats/statistics.h" +#include "meta/analyzers/featurizer.h" + +#include using namespace meta::stats; using namespace meta::embeddings; using namespace meta::util; +using namespace meta::analyzers; namespace meta { @@ -15,37 +19,85 @@ namespace stats namespace opt { -aligned_vector get_docs_voc(aligned_vector docs_models){ - aligned_vector docs_voc(); +aligned_vector get_docs_sizes(aligned_vector> docs_models){ + aligned_vector docs_sizes; - for (auto m_iter: docs_models){ - // todo + long doc_size; + for (int i = 0; i < docs_models.size(); i++){ + doc_size = 0; + + for (auto word: docs_models[i]){ + doc_size += docs_models[i][word]; + } + + docs_sizes.push_back(doc_size); } - return docs_voc; + return docs_sizes; +} + +feature_map get_ref_voc(aligned_vector> docs_models){ + feature_map ref_voc; + featurizer f(ref_voc); + + for (feature_map doc_model: docs_models){ + for (auto word: doc_model){ + f(word, doc_model[word]); + } + } + + return ref_voc; } class dirichlet_optimizer{ public: - dirichlet_optimizer(aligned_vector docs_models, int alpha=1){ + dirichlet_optimizer(aligned_vector> docs_models, int alpha=1){ this->docs_models_ = docs_models; + this->docs_sizes_ = get_docs_sizes(docs_models); + this->default_alpha_ = alpha; - this->docs_voc_ = get_docs_voc(docs_models); + this->ref_voc_ = get_docs_voc(docs_models); } - double minka_fpi(){ - double alpha = default_alpha_; + typedef std::map 
text_vector; + + text_vector minka_fpi(double eps=1e-6, int max_iters=100){ + std::map alpha_m; + + // create initial alpa_m vector + for (auto word: ref_voc_){ + alpha_m[word] = default_alpha_ * ref_voc_[word]; + } + + // stoping criteria for the whole vector alpha_m + int vector_iteration = 0; + double l_dist = std::numeric_limits::infinity(); + bool all_optimal = true; + + while (vector_iteration <= max_iters && !all_optimal){ + all_optimal = true; + std::string word_k; + double alpha_m_k, alpha_k, alpha_m_k_new; + + for (auto alpha_m_iter: alpha_m){ + word_k = alpha_m_iter.first; + alpha_m_k = alpha_m_iter.second; + + alpha_k = alpha_m_k / ref_voc_[word_k]; - double nom, denom, - alpha_m, - alpha_dig, alpha_m_dig, - all_words_count, k_words_count; + // make a step and find new alpha_m_k + alpha_m_k_new = minka_fpi_step(word_k, alpha_k, alpha_m_k); - // digamma(x) - for (std::string word: voc){ - // todo + if (!is_optimal(alpha_m_k, alpha_m_k_new)){ + all_optimal = false; + + alpha_m[word_k] = alpha_m_k_new; + } + } } + + return alpha_m; } double minka_newton(){ @@ -57,12 +109,30 @@ class dirichlet_optimizer{ } private: - double minka_fpi_iters(); + double minka_fpi_step(std::string word_k, double alpha_k, double alpha_m_k){ + double nom = 0, denom = 0; + + double alpha_m_k_dig = digamma(alpha_m_k), + alpha_k_dig = digamma(alpha_k); + + long all_words_count, k_words_count; + + for (int d = 0; d < docs_models_.size(); d++){ + nom += digamma(docs_models_[d][word_k] + alpha_m_k) - alpha_m_k_dig; + + denom += digamma(docs_sizes_[d] + alpha_k) - alpha_k_dig; + } + + return alpha_m_k * nom / denom; + } + double minka_newton_iters(); double minka_lou_iters(); - aligned_vector docs_models_; - aligned_vector docs_voc_; + aligned_vector> docs_models_; + aligned_vector docs_sizes_; + + feature_map ref_voc_; double default_alpha_; }; From 3dc03a8caf772a925b8843c7c90c2512317e1306 Mon Sep 17 00:00:00 2001 From: Aleksey Date: Mon, 20 Nov 2017 17:51:05 +0300 Subject: [PATCH 
04/30] [opt] optimization.h errors fixed, test without MeTa --- include/meta/stats/opt_test.cpp | 90 +++++++++++++ include/meta/stats/optimization.h | 212 ++++++++++++++++++++++++++---- 2 files changed, 274 insertions(+), 28 deletions(-) create mode 100644 include/meta/stats/opt_test.cpp diff --git a/include/meta/stats/opt_test.cpp b/include/meta/stats/opt_test.cpp new file mode 100644 index 000000000..314873bb4 --- /dev/null +++ b/include/meta/stats/opt_test.cpp @@ -0,0 +1,90 @@ +#include +#include + +#include "optimization.h" + +#ifndef TEST_OPT +#define TEST_OPT 1 +#endif // TEST_OPT + +using namespace meta::stats::opt; + +#ifdef TEST_OPT +int main(){ + feature_map dm1, dm2, dm3; + + dm1["1"] = 8; + dm1["2"] = 3; + + dm2["2"] = 4; + dm2["3"] = 3; + + dm3["3"] = 4; + dm3["4"] = 6; + + std::vector> dms; + dms.push_back(dm1); + dms.push_back(dm2); + dms.push_back(dm3); + + dirichlet_optimizer optimizer(dms); + + auto optimized = optimizer.minka_fpi(); + + for (auto iter: optimized){ + std::cout << iter.first << " " << iter.second << std::endl; + } +} +#else +#include "meta/stats/optimization.h" +#include "meta/analyzers/ngram/ngram_word_analyzer.h" +#include "meta/corpus/document.h" +#include "meta/analyzers/token_stream.h" +#include "meta/analyzers/tokenizers/character_tokenizer.h" + +#include "../tests/create_config.h" +#include "meta/meta.h" + +#include "../src/analyzers/analyzer.cpp" + +using namespace meta::stats::opt; +using namespace meta::analyzers; +using namespace meta::corpus; +using namespace meta::analyzers::tokenizers; +using namespace meta::tests; + +std::unique_ptr make_filter() { + auto line_cfg = create_config("line"); + return default_filter_chain(*line_cfg); +} + + +int main(){ + document doc1(meta::doc_id{47}), doc2(meta::doc_id{48}), doc3(meta::doc_id{49}); + doc1.content("Quaia Quaia Coronoid"); + doc2.content("Dj extra Quaia Quaia"); + doc3.content("Coronoid Coronoid Diagram Dj"); + + character_tokenizer tokenizer; + + 
tokenizer.set_content("Quaia Quaia Coronoid"); + + std::vector> docs_models; + + ngram_word_analyzer anal(1, make_filter()); + + docs_models.push_back(anal.analyze(doc1)); + docs_models.push_back(anal.analyze(doc2)); + docs_models.push_back(anal.analyze(doc3)); + + dirichlet_optimizer optimizer(docs_models); + + auto res_map = optimizer.minka_fpi(); + + for (auto iter: res_map){ + std::cout << iter.first << " " << iter.second << std::endl; + } + + return 0; +} +#endif // TEST_OPT diff --git a/include/meta/stats/optimization.h b/include/meta/stats/optimization.h index cf85b8609..4ee923e46 100644 --- a/include/meta/stats/optimization.h +++ b/include/meta/stats/optimization.h @@ -1,17 +1,129 @@ #ifndef OPTIMIZATION_H #define OPTIMIZATION_H -#include "meta/embeddings/word_embeddings.h" -#include "meta/stats/statistics.h" -#include "meta/analyzers/featurizer.h" +#ifndef TEST_OPT +#define TEST_OPT 1 +#endif // TEST_OPT #include +#include +#include + +#ifndef TEST_OPT +#include "meta/stats/statistics.h" +#include "meta/analyzers/featurizer.h" using namespace meta::stats; -using namespace meta::embeddings; using namespace meta::util; using namespace meta::analyzers; +#else + +#ifndef M_PIl +/** The constant Pi in high precision */ +#define M_PIl 3.1415926535897932384626433832795029L +#endif +#ifndef M_GAMMAl +/** Euler's constant in high precision */ +#define M_GAMMAl 0.5772156649015328606065120900824024L +#endif +#ifndef M_LN2l +/** the natural logarithm of 2 in high precision */ +#define M_LN2l 0.6931471805599453094172321214581766L +#endif + +/** The digamma function in long double precision. +* @param x the real value of the argument +* @return the value of the digamma (psi) function at that point +* @author Richard J. 
Mathar +* @since 2005-11-24 +*/ +long double digamma(long double x) +{ + /* force into the interval 1..3 */ + if( x < 0.0L ) + return digamma(1.0L-x)+M_PIl/tanl(M_PIl*(1.0L-x)) ; /* reflection formula */ + else if( x < 1.0L ) + return digamma(1.0L+x)-1.0L/x ; + else if ( x == 1.0L) + return -M_GAMMAl ; + else if ( x == 2.0L) + return 1.0L-M_GAMMAl ; + else if ( x == 3.0L) + return 1.5L-M_GAMMAl ; + else if ( x > 3.0L) + /* duplication formula */ + return 0.5L*(digamma(x/2.0L)+digamma((x+1.0L)/2.0L))+M_LN2l ; + else + { + /* Just for your information, the following lines contain + * the Maple source code to re-generate the table that is + * eventually becoming the Kncoe[] array below + * interface(prettyprint=0) : + * Digits := 63 : + * r := 0 : + * + * for l from 1 to 60 do + * d := binomial(-1/2,l) : + * r := r+d*(-1)^l*(Zeta(2*l+1) -1) ; + * evalf(r) ; + * print(%,evalf(1+Psi(1)-r)) ; + *o d : + * + * for N from 1 to 28 do + * r := 0 : + * n := N-1 : + * + * for l from iquo(n+3,2) to 70 do + * d := 0 : + * for s from 0 to n+1 do + * d := d+(-1)^s*binomial(n+1,s)*binomial((s-1)/2,l) : + * od : + * if 2*l-n > 1 then + * r := r+d*(-1)^l*(Zeta(2*l-n) -1) : + * fi : + * od : + * print(evalf((-1)^n*2*r)) ; + *od : + *quit : + */ + static long double Kncoe[] = { .30459198558715155634315638246624251L, + .72037977439182833573548891941219706L, -.12454959243861367729528855995001087L, + .27769457331927827002810119567456810e-1L, -.67762371439822456447373550186163070e-2L, + .17238755142247705209823876688592170e-2L, -.44817699064252933515310345718960928e-3L, + .11793660000155572716272710617753373e-3L, -.31253894280980134452125172274246963e-4L, + .83173997012173283398932708991137488e-5L, -.22191427643780045431149221890172210e-5L, + .59302266729329346291029599913617915e-6L, -.15863051191470655433559920279603632e-6L, + .42459203983193603241777510648681429e-7L, -.11369129616951114238848106591780146e-7L, + .304502217295931698401459168423403510e-8L, 
-.81568455080753152802915013641723686e-9L, + .21852324749975455125936715817306383e-9L, -.58546491441689515680751900276454407e-10L, + .15686348450871204869813586459513648e-10L, -.42029496273143231373796179302482033e-11L, + .11261435719264907097227520956710754e-11L, -.30174353636860279765375177200637590e-12L, + .80850955256389526647406571868193768e-13L, -.21663779809421233144009565199997351e-13L, + .58047634271339391495076374966835526e-14L, -.15553767189204733561108869588173845e-14L, + .41676108598040807753707828039353330e-15L, -.11167065064221317094734023242188463e-15L } ; + + register long double Tn_1 = 1.0L ; /* T_{n-1}(x), started at n=1 */ + register long double Tn = x-2.0L ; /* T_{n}(x) , started at n=1 */ + register long double resul = Kncoe[0] + Kncoe[1]*Tn ; + + x -= 2.0L ; + + for(int n = 2 ; n < sizeof(Kncoe)/sizeof(long double) ;n++) + { + const long double Tn1 = 2.0L * x * Tn - Tn_1 ; /* Chebyshev recursion, Eq. 22.7.4 Abramowitz-Stegun */ + resul += Kncoe[n]*Tn1 ; + Tn_1 = Tn ; + Tn = Tn1 ; + } + return resul ; + } +} + +template using feature_map = std::map; + +#endif // TEST_OPT + namespace meta { namespace stats @@ -19,15 +131,17 @@ namespace stats namespace opt { -aligned_vector get_docs_sizes(aligned_vector> docs_models){ - aligned_vector docs_sizes; +typedef uint64_t celoe; + +std::vector get_docs_sizes(std::vector> docs_models){ + std::vector docs_sizes; long doc_size; for (int i = 0; i < docs_models.size(); i++){ doc_size = 0; for (auto word: docs_models[i]){ - doc_size += docs_models[i][word]; + doc_size += docs_models[i][word.first]; } docs_sizes.push_back(doc_size); @@ -36,60 +150,93 @@ aligned_vector get_docs_sizes(aligned_vector> docs_model return docs_sizes; } -feature_map get_ref_voc(aligned_vector> docs_models){ - feature_map ref_voc; - featurizer f(ref_voc); +#ifndef TEST_OPT +feature_map get_ref_voc(std::vector> docs_models){ + feature_map ref_voc; + featurizer f(ref_voc); + + for (auto doc_model: docs_models){ + for (auto word: 
doc_model){ + f(word.key(), word.value()); + } + } + + return ref_voc; +} + +#else + +feature_map get_ref_voc(std::vector> docs_models){ + feature_map ref_voc; - for (feature_map doc_model: docs_models){ + for (auto doc_model: docs_models){ for (auto word: doc_model){ - f(word, doc_model[word]); + ref_voc[word.first] += word.second; } } return ref_voc; } +celoe get_ref_voc_size(feature_map ref_voc){ + celoe ref_voc_size = 0; + + for (auto word: ref_voc){ + ref_voc_size += word.second; + } + + return ref_voc_size; +} + +#endif // TEST_OPT + +#include +using namespace std; + class dirichlet_optimizer{ public: - dirichlet_optimizer(aligned_vector> docs_models, int alpha=1){ - this->docs_models_ = docs_models; + dirichlet_optimizer(std::vector> docs_models, int alpha=1) + { + this->docs_models_.assign(docs_models.begin(), docs_models.end()); this->docs_sizes_ = get_docs_sizes(docs_models); this->default_alpha_ = alpha; - this->ref_voc_ = get_docs_voc(docs_models); - } + this->ref_voc_ = get_ref_voc(docs_models); + this->ref_voc_size_ = get_ref_voc_size(this->ref_voc_); - typedef std::map text_vector; + cout << this->ref_voc_size_ << endl; + } - text_vector minka_fpi(double eps=1e-6, int max_iters=100){ + std::map minka_fpi(double eps=1e-6, int max_iters=100){ std::map alpha_m; // create initial alpa_m vector for (auto word: ref_voc_){ - alpha_m[word] = default_alpha_ * ref_voc_[word]; + alpha_m[word.first] = default_alpha_ * word.second / ref_voc_size_; } // stoping criteria for the whole vector alpha_m int vector_iteration = 0; - double l_dist = std::numeric_limits::infinity(); - bool all_optimal = true; + bool all_optimal; while (vector_iteration <= max_iters && !all_optimal){ all_optimal = true; std::string word_k; double alpha_m_k, alpha_k, alpha_m_k_new; + cout << endl; + for (auto alpha_m_iter: alpha_m){ word_k = alpha_m_iter.first; alpha_m_k = alpha_m_iter.second; - alpha_k = alpha_m_k / ref_voc_[word_k]; + alpha_k = alpha_m_k / ((double)ref_voc_[word_k] / 
ref_voc_size_); // make a step and find new alpha_m_k alpha_m_k_new = minka_fpi_step(word_k, alpha_k, alpha_m_k); - if (!is_optimal(alpha_m_k, alpha_m_k_new)){ + if (std::abs(alpha_m_k - alpha_m_k_new) > eps){ all_optimal = false; alpha_m[word_k] = alpha_m_k_new; @@ -111,30 +258,39 @@ class dirichlet_optimizer{ private: double minka_fpi_step(std::string word_k, double alpha_k, double alpha_m_k){ double nom = 0, denom = 0; - double alpha_m_k_dig = digamma(alpha_m_k), alpha_k_dig = digamma(alpha_k); long all_words_count, k_words_count; for (int d = 0; d < docs_models_.size(); d++){ + nom += digamma(docs_models_[d][word_k] + alpha_m_k) - alpha_m_k_dig; denom += digamma(docs_sizes_[d] + alpha_k) - alpha_k_dig; + } - return alpha_m_k * nom / denom; + double alpha_m_k_new = alpha_m_k * nom / denom;; + + cout << word_k << " " << alpha_k << " " << alpha_m_k << " " << alpha_m_k / ((double)ref_voc_[word_k] / ref_voc_size_) << " " << alpha_m_k_new << " " << nom << " " << denom << endl; + + return alpha_m_k_new; } double minka_newton_iters(); double minka_lou_iters(); - aligned_vector> docs_models_; - aligned_vector docs_sizes_; + std::vector> docs_models_; + std::vector docs_sizes_; - feature_map ref_voc_; + feature_map ref_voc_; + celoe ref_voc_size_; double default_alpha_; }; +} +} +} #endif // OPTIMIZATION_H From eeb91688b8ee66d630255c42004dd35307f0fd71 Mon Sep 17 00:00:00 2001 From: Aleksey Date: Mon, 20 Nov 2017 19:43:16 +0300 Subject: [PATCH 05/30] [opt] debug output --- include/meta/stats/opt_test.cpp | 6 ++---- include/meta/stats/optimization.h | 16 +++++++++++++--- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/meta/stats/opt_test.cpp b/include/meta/stats/opt_test.cpp index 314873bb4..6bb857362 100644 --- a/include/meta/stats/opt_test.cpp +++ b/include/meta/stats/opt_test.cpp @@ -29,11 +29,9 @@ int main(){ dirichlet_optimizer optimizer(dms); - auto optimized = optimizer.minka_fpi(); + auto optimal_alpha = optimizer.minka_fpi(); - for (auto 
iter: optimized){ - std::cout << iter.first << " " << iter.second << std::endl; - } + cout << endl << "optimal alpha: " << optimal_alpha; } #else #include "meta/stats/optimization.h" diff --git a/include/meta/stats/optimization.h b/include/meta/stats/optimization.h index 4ee923e46..06ab27f14 100644 --- a/include/meta/stats/optimization.h +++ b/include/meta/stats/optimization.h @@ -208,7 +208,7 @@ class dirichlet_optimizer{ cout << this->ref_voc_size_ << endl; } - std::map minka_fpi(double eps=1e-6, int max_iters=100){ + double minka_fpi(double eps=1e-3, int max_iters=100){ std::map alpha_m; // create initial alpa_m vector @@ -225,7 +225,7 @@ class dirichlet_optimizer{ std::string word_k; double alpha_m_k, alpha_k, alpha_m_k_new; - cout << endl; + cout << vector_iteration << endl; for (auto alpha_m_iter: alpha_m){ word_k = alpha_m_iter.first; @@ -242,9 +242,19 @@ class dirichlet_optimizer{ alpha_m[word_k] = alpha_m_k_new; } } + + vector_iteration++; + } + + cout << endl << "Alpha_m for each word:" << endl; + + double optimal_alpha = 0; + for (auto alpha_m_iter: alpha_m){ + cout << alpha_m_iter.first << " " << alpha_m_iter.second << std::endl; + optimal_alpha += alpha_m_iter.second; } - return alpha_m; + return optimal_alpha; } double minka_newton(){ From 766754f9726dd16c7709a7c8716fc1643d59dd74 Mon Sep 17 00:00:00 2001 From: MakKolts Date: Mon, 20 Nov 2017 23:32:35 +0300 Subject: [PATCH 06/30] Adding optimization.cpp --- src/stats/CMakeLists.txt | 2 +- src/stats/optimization.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 src/stats/optimization.cpp diff --git a/src/stats/CMakeLists.txt b/src/stats/CMakeLists.txt index c9872741c..3e09824f2 100644 --- a/src/stats/CMakeLists.txt +++ b/src/stats/CMakeLists.txt @@ -1,6 +1,6 @@ project(meta-stats) -add_library(meta-stats running_stats.cpp) +add_library(meta-stats running_stats.cpp optimization.cpp) target_link_libraries(meta-stats meta-definitions) install(TARGETS meta-stats diff --git 
a/src/stats/optimization.cpp b/src/stats/optimization.cpp new file mode 100644 index 000000000..87a7f3af4 --- /dev/null +++ b/src/stats/optimization.cpp @@ -0,0 +1 @@ +#include "meta/stats/optimization.h" From e9c99df1d85a12baee46b311ee92e518db1377c4 Mon Sep 17 00:00:00 2001 From: Aleksey Date: Wed, 29 Nov 2017 17:14:58 +0300 Subject: [PATCH 07/30] [opt] classes for methods in dirichlet_prior --- include/meta/index/ranker/dirichlet_prior.h | 23 ++ include/meta/stats/opt_test.cpp | 88 ------ include/meta/stats/optimization.h | 306 -------------------- src/index/ranker/dirichlet_prior.cpp | 2 +- 4 files changed, 24 insertions(+), 395 deletions(-) delete mode 100644 include/meta/stats/opt_test.cpp delete mode 100644 include/meta/stats/optimization.h diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h index dfb5aef42..37ff18504 100644 --- a/include/meta/index/ranker/dirichlet_prior.h +++ b/include/meta/index/ranker/dirichlet_prior.h @@ -71,6 +71,29 @@ class dirichlet_prior : public language_model_ranker const float mu_; }; +class dirichlet_prior_opt : public dirichlet_prior{ + void rank(ranker_context &ctx, uint64_t num_results, const filter_function_type &filter) const override{ + // optimize mu according to ranker_context before ranking + this->optimize_mu(ctx); + + ranking_function::rank(ctx, num_results, filter); + } + + virtual void optimize_mu(const ranker_context &ctx) = 0; +}; + +class digamma_rec: public dirichlet_prior_opt{ + void optimize_mu(const ranker_context &ctx) override; +}; + +class log_approx: public dirichlet_prior_opt{ + void optimize_mu(const ranker_context &ctx) override; +}; + +class mackay_peto: public dirichlet_prior_opt{ + void optimize_mu(const ranker_context &ctx) override; +}; + /** * Specialization of the factory method used to create dirichlet_prior * rankers. 
diff --git a/include/meta/stats/opt_test.cpp b/include/meta/stats/opt_test.cpp deleted file mode 100644 index 6bb857362..000000000 --- a/include/meta/stats/opt_test.cpp +++ /dev/null @@ -1,88 +0,0 @@ -#include -#include - -#include "optimization.h" - -#ifndef TEST_OPT -#define TEST_OPT 1 -#endif // TEST_OPT - -using namespace meta::stats::opt; - -#ifdef TEST_OPT -int main(){ - feature_map dm1, dm2, dm3; - - dm1["1"] = 8; - dm1["2"] = 3; - - dm2["2"] = 4; - dm2["3"] = 3; - - dm3["3"] = 4; - dm3["4"] = 6; - - std::vector> dms; - dms.push_back(dm1); - dms.push_back(dm2); - dms.push_back(dm3); - - dirichlet_optimizer optimizer(dms); - - auto optimal_alpha = optimizer.minka_fpi(); - - cout << endl << "optimal alpha: " << optimal_alpha; -} -#else -#include "meta/stats/optimization.h" -#include "meta/analyzers/ngram/ngram_word_analyzer.h" -#include "meta/corpus/document.h" -#include "meta/analyzers/token_stream.h" -#include "meta/analyzers/tokenizers/character_tokenizer.h" - -#include "../tests/create_config.h" -#include "meta/meta.h" - -#include "../src/analyzers/analyzer.cpp" - -using namespace meta::stats::opt; -using namespace meta::analyzers; -using namespace meta::corpus; -using namespace meta::analyzers::tokenizers; -using namespace meta::tests; - -std::unique_ptr make_filter() { - auto line_cfg = create_config("line"); - return default_filter_chain(*line_cfg); -} - - -int main(){ - document doc1(meta::doc_id{47}), doc2(meta::doc_id{48}), doc3(meta::doc_id{49}); - doc1.content("Quaia Quaia Coronoid"); - doc2.content("Dj extra Quaia Quaia"); - doc3.content("Coronoid Coronoid Diagram Dj"); - - character_tokenizer tokenizer; - - tokenizer.set_content("Quaia Quaia Coronoid"); - - std::vector> docs_models; - - ngram_word_analyzer anal(1, make_filter()); - - docs_models.push_back(anal.analyze(doc1)); - docs_models.push_back(anal.analyze(doc2)); - docs_models.push_back(anal.analyze(doc3)); - - dirichlet_optimizer optimizer(docs_models); - - auto res_map = 
optimizer.minka_fpi(); - - for (auto iter: res_map){ - std::cout << iter.first << " " << iter.second << std::endl; - } - - return 0; -} -#endif // TEST_OPT diff --git a/include/meta/stats/optimization.h b/include/meta/stats/optimization.h deleted file mode 100644 index 06ab27f14..000000000 --- a/include/meta/stats/optimization.h +++ /dev/null @@ -1,306 +0,0 @@ -#ifndef OPTIMIZATION_H -#define OPTIMIZATION_H - -#ifndef TEST_OPT -#define TEST_OPT 1 -#endif // TEST_OPT - -#include -#include -#include - -#ifndef TEST_OPT -#include "meta/stats/statistics.h" -#include "meta/analyzers/featurizer.h" - -using namespace meta::stats; -using namespace meta::util; -using namespace meta::analyzers; - -#else - -#ifndef M_PIl -/** The constant Pi in high precision */ -#define M_PIl 3.1415926535897932384626433832795029L -#endif -#ifndef M_GAMMAl -/** Euler's constant in high precision */ -#define M_GAMMAl 0.5772156649015328606065120900824024L -#endif -#ifndef M_LN2l -/** the natural logarithm of 2 in high precision */ -#define M_LN2l 0.6931471805599453094172321214581766L -#endif - -/** The digamma function in long double precision. -* @param x the real value of the argument -* @return the value of the digamma (psi) function at that point -* @author Richard J. 
Mathar -* @since 2005-11-24 -*/ -long double digamma(long double x) -{ - /* force into the interval 1..3 */ - if( x < 0.0L ) - return digamma(1.0L-x)+M_PIl/tanl(M_PIl*(1.0L-x)) ; /* reflection formula */ - else if( x < 1.0L ) - return digamma(1.0L+x)-1.0L/x ; - else if ( x == 1.0L) - return -M_GAMMAl ; - else if ( x == 2.0L) - return 1.0L-M_GAMMAl ; - else if ( x == 3.0L) - return 1.5L-M_GAMMAl ; - else if ( x > 3.0L) - /* duplication formula */ - return 0.5L*(digamma(x/2.0L)+digamma((x+1.0L)/2.0L))+M_LN2l ; - else - { - /* Just for your information, the following lines contain - * the Maple source code to re-generate the table that is - * eventually becoming the Kncoe[] array below - * interface(prettyprint=0) : - * Digits := 63 : - * r := 0 : - * - * for l from 1 to 60 do - * d := binomial(-1/2,l) : - * r := r+d*(-1)^l*(Zeta(2*l+1) -1) ; - * evalf(r) ; - * print(%,evalf(1+Psi(1)-r)) ; - *o d : - * - * for N from 1 to 28 do - * r := 0 : - * n := N-1 : - * - * for l from iquo(n+3,2) to 70 do - * d := 0 : - * for s from 0 to n+1 do - * d := d+(-1)^s*binomial(n+1,s)*binomial((s-1)/2,l) : - * od : - * if 2*l-n > 1 then - * r := r+d*(-1)^l*(Zeta(2*l-n) -1) : - * fi : - * od : - * print(evalf((-1)^n*2*r)) ; - *od : - *quit : - */ - static long double Kncoe[] = { .30459198558715155634315638246624251L, - .72037977439182833573548891941219706L, -.12454959243861367729528855995001087L, - .27769457331927827002810119567456810e-1L, -.67762371439822456447373550186163070e-2L, - .17238755142247705209823876688592170e-2L, -.44817699064252933515310345718960928e-3L, - .11793660000155572716272710617753373e-3L, -.31253894280980134452125172274246963e-4L, - .83173997012173283398932708991137488e-5L, -.22191427643780045431149221890172210e-5L, - .59302266729329346291029599913617915e-6L, -.15863051191470655433559920279603632e-6L, - .42459203983193603241777510648681429e-7L, -.11369129616951114238848106591780146e-7L, - .304502217295931698401459168423403510e-8L, 
-.81568455080753152802915013641723686e-9L, - .21852324749975455125936715817306383e-9L, -.58546491441689515680751900276454407e-10L, - .15686348450871204869813586459513648e-10L, -.42029496273143231373796179302482033e-11L, - .11261435719264907097227520956710754e-11L, -.30174353636860279765375177200637590e-12L, - .80850955256389526647406571868193768e-13L, -.21663779809421233144009565199997351e-13L, - .58047634271339391495076374966835526e-14L, -.15553767189204733561108869588173845e-14L, - .41676108598040807753707828039353330e-15L, -.11167065064221317094734023242188463e-15L } ; - - register long double Tn_1 = 1.0L ; /* T_{n-1}(x), started at n=1 */ - register long double Tn = x-2.0L ; /* T_{n}(x) , started at n=1 */ - register long double resul = Kncoe[0] + Kncoe[1]*Tn ; - - x -= 2.0L ; - - for(int n = 2 ; n < sizeof(Kncoe)/sizeof(long double) ;n++) - { - const long double Tn1 = 2.0L * x * Tn - Tn_1 ; /* Chebyshev recursion, Eq. 22.7.4 Abramowitz-Stegun */ - resul += Kncoe[n]*Tn1 ; - Tn_1 = Tn ; - Tn = Tn1 ; - } - return resul ; - } -} - -template using feature_map = std::map; - -#endif // TEST_OPT - -namespace meta -{ -namespace stats -{ -namespace opt -{ - -typedef uint64_t celoe; - -std::vector get_docs_sizes(std::vector> docs_models){ - std::vector docs_sizes; - - long doc_size; - for (int i = 0; i < docs_models.size(); i++){ - doc_size = 0; - - for (auto word: docs_models[i]){ - doc_size += docs_models[i][word.first]; - } - - docs_sizes.push_back(doc_size); - } - - return docs_sizes; -} - -#ifndef TEST_OPT -feature_map get_ref_voc(std::vector> docs_models){ - feature_map ref_voc; - featurizer f(ref_voc); - - for (auto doc_model: docs_models){ - for (auto word: doc_model){ - f(word.key(), word.value()); - } - } - - return ref_voc; -} - -#else - -feature_map get_ref_voc(std::vector> docs_models){ - feature_map ref_voc; - - for (auto doc_model: docs_models){ - for (auto word: doc_model){ - ref_voc[word.first] += word.second; - } - } - - return ref_voc; -} - -celoe 
get_ref_voc_size(feature_map ref_voc){ - celoe ref_voc_size = 0; - - for (auto word: ref_voc){ - ref_voc_size += word.second; - } - - return ref_voc_size; -} - -#endif // TEST_OPT - -#include -using namespace std; - -class dirichlet_optimizer{ -public: - dirichlet_optimizer(std::vector> docs_models, int alpha=1) - { - this->docs_models_.assign(docs_models.begin(), docs_models.end()); - this->docs_sizes_ = get_docs_sizes(docs_models); - - this->default_alpha_ = alpha; - - this->ref_voc_ = get_ref_voc(docs_models); - this->ref_voc_size_ = get_ref_voc_size(this->ref_voc_); - - cout << this->ref_voc_size_ << endl; - } - - double minka_fpi(double eps=1e-3, int max_iters=100){ - std::map alpha_m; - - // create initial alpa_m vector - for (auto word: ref_voc_){ - alpha_m[word.first] = default_alpha_ * word.second / ref_voc_size_; - } - - // stoping criteria for the whole vector alpha_m - int vector_iteration = 0; - bool all_optimal; - - while (vector_iteration <= max_iters && !all_optimal){ - all_optimal = true; - std::string word_k; - double alpha_m_k, alpha_k, alpha_m_k_new; - - cout << vector_iteration << endl; - - for (auto alpha_m_iter: alpha_m){ - word_k = alpha_m_iter.first; - alpha_m_k = alpha_m_iter.second; - - alpha_k = alpha_m_k / ((double)ref_voc_[word_k] / ref_voc_size_); - - // make a step and find new alpha_m_k - alpha_m_k_new = minka_fpi_step(word_k, alpha_k, alpha_m_k); - - if (std::abs(alpha_m_k - alpha_m_k_new) > eps){ - all_optimal = false; - - alpha_m[word_k] = alpha_m_k_new; - } - } - - vector_iteration++; - } - - cout << endl << "Alpha_m for each word:" << endl; - - double optimal_alpha = 0; - for (auto alpha_m_iter: alpha_m){ - cout << alpha_m_iter.first << " " << alpha_m_iter.second << std::endl; - optimal_alpha += alpha_m_iter.second; - } - - return optimal_alpha; - } - - double minka_newton(){ - // todo - } - - double minka_lou(){ - // todo - } - -private: - double minka_fpi_step(std::string word_k, double alpha_k, double alpha_m_k){ - double 
nom = 0, denom = 0; - double alpha_m_k_dig = digamma(alpha_m_k), - alpha_k_dig = digamma(alpha_k); - - long all_words_count, k_words_count; - - for (int d = 0; d < docs_models_.size(); d++){ - - nom += digamma(docs_models_[d][word_k] + alpha_m_k) - alpha_m_k_dig; - - denom += digamma(docs_sizes_[d] + alpha_k) - alpha_k_dig; - - } - - double alpha_m_k_new = alpha_m_k * nom / denom;; - - cout << word_k << " " << alpha_k << " " << alpha_m_k << " " << alpha_m_k / ((double)ref_voc_[word_k] / ref_voc_size_) << " " << alpha_m_k_new << " " << nom << " " << denom << endl; - - return alpha_m_k_new; - } - - double minka_newton_iters(); - double minka_lou_iters(); - - std::vector> docs_models_; - std::vector docs_sizes_; - - feature_map ref_voc_; - celoe ref_voc_size_; - - double default_alpha_; -}; -} -} -} - -#endif // OPTIMIZATION_H diff --git a/src/index/ranker/dirichlet_prior.cpp b/src/index/ranker/dirichlet_prior.cpp index 07536afbe..76230043e 100644 --- a/src/index/ranker/dirichlet_prior.cpp +++ b/src/index/ranker/dirichlet_prior.cpp @@ -33,7 +33,7 @@ void dirichlet_prior::save(std::ostream& out) const io::packed::write(out, mu_); } -float dirichlet_prior::smoothed_prob(const score_data& sd) const +float dirichlet_prior::smoothed_prob(score_data& sd) { float pc = static_cast(sd.corpus_term_count) / sd.total_terms; float numerator = sd.doc_term_count + mu_ * pc; From 54d727205163a07f583694f3582a8adb422337c7 Mon Sep 17 00:00:00 2001 From: M Date: Wed, 29 Nov 2017 17:23:44 +0300 Subject: [PATCH 08/30] Deletion of previous stuff --- src/stats/optimization.cpp | 1 - 1 file changed, 1 deletion(-) delete mode 100644 src/stats/optimization.cpp diff --git a/src/stats/optimization.cpp b/src/stats/optimization.cpp deleted file mode 100644 index 87a7f3af4..000000000 --- a/src/stats/optimization.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "meta/stats/optimization.h" From 65851899819645fa5cd33f4f560ef192e1b87441 Mon Sep 17 00:00:00 2001 From: M Date: Wed, 29 Nov 2017 18:54:57 +0300 
Subject: [PATCH 09/30] Test for dirichlet optimizations --- include/meta/index/ranker/dirichlet_prior.h | 21 ++++++++++++++------- src/index/ranker/CMakeLists.txt | 2 ++ src/index/ranker/dirichlet_prior.cpp | 2 +- src/index/ranker/test_opt/CMakeLists.txt | 8 ++++++++ src/index/ranker/test_opt/test.cpp | 12 ++++++++++++ src/stats/CMakeLists.txt | 2 +- 6 files changed, 38 insertions(+), 9 deletions(-) create mode 100644 src/index/ranker/test_opt/CMakeLists.txt create mode 100644 src/index/ranker/test_opt/test.cpp diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h index 37ff18504..0f3bc3bc7 100644 --- a/include/meta/index/ranker/dirichlet_prior.h +++ b/include/meta/index/ranker/dirichlet_prior.h @@ -72,26 +72,33 @@ class dirichlet_prior : public language_model_ranker }; class dirichlet_prior_opt : public dirichlet_prior{ - void rank(ranker_context &ctx, uint64_t num_results, const filter_function_type &filter) const override{ +// void rank(ranker_context &ctx, uint64_t num_results, const filter_function_type &filter) const override{ +// ranking_function::rank(ctx, num_results, filter); +// } + template + std::vector score(inverted_index& idx, ForwardIterator begin, + ForwardIterator end, + uint64_t num_results = 10) + { // optimize mu according to ranker_context before ranking - this->optimize_mu(ctx); + this->optimize_mu(idx); - ranking_function::rank(ctx, num_results, filter); + return ranker::score(idx, begin, end, num_results); } - virtual void optimize_mu(const ranker_context &ctx) = 0; + virtual void optimize_mu(const inverted_index& idx) = 0; }; class digamma_rec: public dirichlet_prior_opt{ - void optimize_mu(const ranker_context &ctx) override; + void optimize_mu(const inverted_index& idx) override; }; class log_approx: public dirichlet_prior_opt{ - void optimize_mu(const ranker_context &ctx) override; + void optimize_mu(const inverted_index& idx) override; }; class mackay_peto: public dirichlet_prior_opt{ - 
void optimize_mu(const ranker_context &ctx) override; + void optimize_mu(const inverted_index& idx) override; }; /** diff --git a/src/index/ranker/CMakeLists.txt b/src/index/ranker/CMakeLists.txt index 20518f751..e386d54b6 100644 --- a/src/index/ranker/CMakeLists.txt +++ b/src/index/ranker/CMakeLists.txt @@ -1,5 +1,7 @@ project(meta-ranker) +add_subdirectory(test_opt) + add_library(meta-ranker absolute_discount.cpp dirichlet_prior.cpp jelinek_mercer.cpp diff --git a/src/index/ranker/dirichlet_prior.cpp b/src/index/ranker/dirichlet_prior.cpp index 76230043e..07536afbe 100644 --- a/src/index/ranker/dirichlet_prior.cpp +++ b/src/index/ranker/dirichlet_prior.cpp @@ -33,7 +33,7 @@ void dirichlet_prior::save(std::ostream& out) const io::packed::write(out, mu_); } -float dirichlet_prior::smoothed_prob(score_data& sd) +float dirichlet_prior::smoothed_prob(const score_data& sd) const { float pc = static_cast(sd.corpus_term_count) / sd.total_terms; float numerator = sd.doc_term_count + mu_ * pc; diff --git a/src/index/ranker/test_opt/CMakeLists.txt b/src/index/ranker/test_opt/CMakeLists.txt new file mode 100644 index 000000000..e03d4f7f9 --- /dev/null +++ b/src/index/ranker/test_opt/CMakeLists.txt @@ -0,0 +1,8 @@ +project(meta-dirichlet-test) + +include_directories(../../../../include) + +add_executable(test_opt test.cpp) + +target_link_libraries(test_opt meta-ranker) + diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp new file mode 100644 index 000000000..e48e078a8 --- /dev/null +++ b/src/index/ranker/test_opt/test.cpp @@ -0,0 +1,12 @@ +#include "meta/corpus/document.h" +#include "meta/index/ranker/all.h" +#include "meta/index/forward_index.h" + +#include + + +int main(){ + + std::cout << "Quaia!" 
<< std::endl; + meta::index::dirichlet_prior ranker; +} diff --git a/src/stats/CMakeLists.txt b/src/stats/CMakeLists.txt index 3e09824f2..c9872741c 100644 --- a/src/stats/CMakeLists.txt +++ b/src/stats/CMakeLists.txt @@ -1,6 +1,6 @@ project(meta-stats) -add_library(meta-stats running_stats.cpp optimization.cpp) +add_library(meta-stats running_stats.cpp) target_link_libraries(meta-stats meta-definitions) install(TARGETS meta-stats From c0a357cf363163735fac893fd7de62031dda1d28 Mon Sep 17 00:00:00 2001 From: M Date: Wed, 29 Nov 2017 20:38:59 +0300 Subject: [PATCH 10/30] Private/public methods --- include/meta/index/ranker/dirichlet_prior.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h index 0f3bc3bc7..63f44fb2c 100644 --- a/include/meta/index/ranker/dirichlet_prior.h +++ b/include/meta/index/ranker/dirichlet_prior.h @@ -75,6 +75,7 @@ class dirichlet_prior_opt : public dirichlet_prior{ // void rank(ranker_context &ctx, uint64_t num_results, const filter_function_type &filter) const override{ // ranking_function::rank(ctx, num_results, filter); // } +public: template std::vector score(inverted_index& idx, ForwardIterator begin, ForwardIterator end, @@ -85,7 +86,7 @@ class dirichlet_prior_opt : public dirichlet_prior{ return ranker::score(idx, begin, end, num_results); } - +private: virtual void optimize_mu(const inverted_index& idx) = 0; }; From 76d32aed62081f68e675f698359ece9976fb547d Mon Sep 17 00:00:00 2001 From: Aleksey Date: Wed, 29 Nov 2017 20:56:14 +0300 Subject: [PATCH 11/30] [opt] test indexes --- src/index/ranker/test_opt/test.cpp | 44 ++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp index e48e078a8..c6dc0de61 100644 --- a/src/index/ranker/test_opt/test.cpp +++ b/src/index/ranker/test_opt/test.cpp @@ -4,9 +4,47 @@ #include +#include 
"meta/index/inverted_index.h" +#include "meta/logging/logger.h" +#include "meta/parser/analyzers/tree_analyzer.h" +#include "meta/sequence/analyzers/ngram_pos_analyzer.h" +#include "meta/util/time.h" -int main(){ +using namespace meta; - std::cout << "Quaia!" << std::endl; - meta::index::dirichlet_prior ranker; + +int main(int argc, char* argv[]) +{ + if (argc != 2) + { + std::cerr << "Usage:\t" << argv[0] << " configFile" << std::endl; + return 1; + } + + // Turn on logging to std::cerr. + logging::set_cerr_logging(); + + // Register additional analyzers + parser::register_analyzers(); + sequence::register_analyzers(); + + // Time how long it takes to create the index. By default, common::time's + // unit of measurement is milliseconds. + auto time = common::time([&]() + { + // Creates an inverted index with no cache. We don't need a cache here + // since we're never searching the index, only building it. + auto config = cpptoml::parse_file(argv[1]); + auto idx = index::make_index(*config); + + // Print out some data about the corpus. 
+ std::cout << "Number of documents: " << idx->num_docs() << std::endl; + std::cout << "Avg Doc Length: " << idx->avg_doc_length() << std::endl; + std::cout << "Unique Terms: " << idx->unique_terms() << std::endl; + }); + + std::cout << "Index generation took: " << time.count() / 1000.0 + << " seconds" << std::endl; + + return 0; } From 4ccda58e02e2c50e5efaff27d3609056334a8fed Mon Sep 17 00:00:00 2001 From: M Date: Wed, 29 Nov 2017 23:15:35 +0300 Subject: [PATCH 12/30] Interface for methods --- include/meta/index/ranker/dirichlet_prior.h | 20 ++++++++++++-------- src/index/ranker/test_opt/CMakeLists.txt | 4 +++- src/index/ranker/test_opt/test.cpp | 9 ++++----- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h index 63f44fb2c..ecd3b0da0 100644 --- a/include/meta/index/ranker/dirichlet_prior.h +++ b/include/meta/index/ranker/dirichlet_prior.h @@ -66,15 +66,13 @@ class dirichlet_prior : public language_model_ranker */ float doc_constant(const score_data& sd) const override; - private: + protected: /// the Dirichlet prior parameter - const float mu_; +// const float mu_; + float mu_; }; class dirichlet_prior_opt : public dirichlet_prior{ -// void rank(ranker_context &ctx, uint64_t num_results, const filter_function_type &filter) const override{ -// ranking_function::rank(ctx, num_results, filter); -// } public: template std::vector score(inverted_index& idx, ForwardIterator begin, @@ -86,20 +84,26 @@ class dirichlet_prior_opt : public dirichlet_prior{ return ranker::score(idx, begin, end, num_results); } + + float get_optimized_mu(const inverted_index& idx) { + optimize_mu(idx); + return mu_; + } + private: virtual void optimize_mu(const inverted_index& idx) = 0; }; class digamma_rec: public dirichlet_prior_opt{ - void optimize_mu(const inverted_index& idx) override; + void optimize_mu(const inverted_index& idx) override { mu_ = 0;}; }; class log_approx: public 
dirichlet_prior_opt{ - void optimize_mu(const inverted_index& idx) override; + void optimize_mu(const inverted_index& idx) override { mu_ = 0;}; }; class mackay_peto: public dirichlet_prior_opt{ - void optimize_mu(const inverted_index& idx) override; + void optimize_mu(const inverted_index& idx) override { mu_ = 0;}; }; /** diff --git a/src/index/ranker/test_opt/CMakeLists.txt b/src/index/ranker/test_opt/CMakeLists.txt index e03d4f7f9..1e8cdc0ee 100644 --- a/src/index/ranker/test_opt/CMakeLists.txt +++ b/src/index/ranker/test_opt/CMakeLists.txt @@ -4,5 +4,7 @@ include_directories(../../../../include) add_executable(test_opt test.cpp) -target_link_libraries(test_opt meta-ranker) +target_link_libraries(test_opt meta-ranker + meta-sequence-analyzers + meta-parser-analyzers) diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp index c6dc0de61..7fc4e5e0b 100644 --- a/src/index/ranker/test_opt/test.cpp +++ b/src/index/ranker/test_opt/test.cpp @@ -37,13 +37,12 @@ int main(int argc, char* argv[]) auto config = cpptoml::parse_file(argv[1]); auto idx = index::make_index(*config); - // Print out some data about the corpus. 
- std::cout << "Number of documents: " << idx->num_docs() << std::endl; - std::cout << "Avg Doc Length: " << idx->avg_doc_length() << std::endl; - std::cout << "Unique Terms: " << idx->unique_terms() << std::endl; + // Create and make score of optimizer + index::digamma_rec ranker; + std::cout << ranker.get_optimized_mu(*idx) << std::endl; }); - std::cout << "Index generation took: " << time.count() / 1000.0 + std::cout << "Method took: " << time.count() / 1000.0 << " seconds" << std::endl; return 0; From 248c1515dee3c7023228e27d92ab6d706344e03e Mon Sep 17 00:00:00 2001 From: M Date: Thu, 30 Nov 2017 00:22:50 +0300 Subject: [PATCH 13/30] Refactoring of optimization interface --- include/meta/index/ranker/dirichlet_prior.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h index ecd3b0da0..b53bd86ff 100644 --- a/include/meta/index/ranker/dirichlet_prior.h +++ b/include/meta/index/ranker/dirichlet_prior.h @@ -86,11 +86,16 @@ class dirichlet_prior_opt : public dirichlet_prior{ } float get_optimized_mu(const inverted_index& idx) { - optimize_mu(idx); + optimize(idx); return mu_; } private: + void optimize(const inverted_index& idx) { + doc_id y = idx.docs()[0]; + + } + virtual void optimize_mu(const inverted_index& idx) = 0; }; From 61ece78e0c0b640383a25428e4b4c72852950007 Mon Sep 17 00:00:00 2001 From: Alex2304 Date: Thu, 30 Nov 2017 00:24:15 +0300 Subject: [PATCH 14/30] [opt] tmp for merge --- include/meta/index/ranker/dirichlet_prior.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h index ecd3b0da0..a9651607c 100644 --- a/include/meta/index/ranker/dirichlet_prior.h +++ b/include/meta/index/ranker/dirichlet_prior.h @@ -91,7 +91,12 @@ class dirichlet_prior_opt : public dirichlet_prior{ } private: - virtual void optimize_mu(const inverted_index& 
idx) = 0; + void optimize_mu(const inverted_index& idx){ + // TODO: parse idx + idx.term_freq; + idx.doc_size; + idx. + } }; class digamma_rec: public dirichlet_prior_opt{ From f9792648e6b67c58a280b10bfe907da478aa5cc8 Mon Sep 17 00:00:00 2001 From: M Date: Thu, 30 Nov 2017 01:53:08 +0300 Subject: [PATCH 15/30] Tests for all functions at same time --- include/meta/index/ranker/dirichlet_prior.h | 1 - src/index/ranker/test_opt/test.cpp | 32 +++++++++++++++++---- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h index b53bd86ff..c7d3dc9bf 100644 --- a/include/meta/index/ranker/dirichlet_prior.h +++ b/include/meta/index/ranker/dirichlet_prior.h @@ -92,7 +92,6 @@ class dirichlet_prior_opt : public dirichlet_prior{ private: void optimize(const inverted_index& idx) { - doc_id y = idx.docs()[0]; } diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp index 7fc4e5e0b..b85fdbfca 100644 --- a/src/index/ranker/test_opt/test.cpp +++ b/src/index/ranker/test_opt/test.cpp @@ -28,21 +28,41 @@ int main(int argc, char* argv[]) parser::register_analyzers(); sequence::register_analyzers(); + // Creates an inverted index with no cache. We don't need a cache here + // since we're never searching the index, only building it. + auto config = cpptoml::parse_file(argv[1]); + auto idx = index::make_index(*config); + // Time how long it takes to create the index. By default, common::time's // unit of measurement is milliseconds. auto time = common::time([&]() { - // Creates an inverted index with no cache. We don't need a cache here - // since we're never searching the index, only building it. 
- auto config = cpptoml::parse_file(argv[1]); - auto idx = index::make_index(*config); - // Create and make score of optimizer index::digamma_rec ranker; std::cout << ranker.get_optimized_mu(*idx) << std::endl; }); - std::cout << "Method took: " << time.count() / 1000.0 + std::cout << "Method DR took: " << time.count() / 1000.0 + << " seconds" << std::endl; + + time = common::time([&]() + { + // Create and make score of optimizer + index::log_approx ranker; + std::cout << ranker.get_optimized_mu(*idx) << std::endl; + }); + + std::cout << "Method LA took: " << time.count() / 1000.0 + << " seconds" << std::endl; + + time = common::time([&]() + { + // Create and make score of optimizer + index::mackay_peto ranker; + std::cout << ranker.get_optimized_mu(*idx) << std::endl; + }); + + std::cout << "Method MP took: " << time.count() / 1000.0 << " seconds" << std::endl; return 0; From ed475b5339e7dd8dd4012a55f19e9948aaf26a1c Mon Sep 17 00:00:00 2001 From: Alex2304 Date: Thu, 30 Nov 2017 04:23:07 +0300 Subject: [PATCH 16/30] [opt] + first method without testing --- include/meta/index/ranker/dirichlet_prior.h | 135 ++++++++++++++++++-- 1 file changed, 124 insertions(+), 11 deletions(-) diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h index f0a236c6b..a11a0a2fe 100644 --- a/include/meta/index/ranker/dirichlet_prior.h +++ b/include/meta/index/ranker/dirichlet_prior.h @@ -74,6 +74,32 @@ class dirichlet_prior : public language_model_ranker float mu_; }; +struct docs_data +{ + // general info + + inverted_index& idx; + /// ids of all documents + std::vector doc_ids; + /// ids of all terms + std::vector term_ids; + + /** + * Constructor to initialize most elements. 
+ * @param p_idx The index that is being used + * @param p_doc_ids ids of all docs + * @param p_term_ids ids of all terms + */ + score_data(inverted_index& p_idx, std::vector p_doc_ids, std::vector p_term_ids, + uint64_t p_total_terms, float p_query_length) + : idx(p_idx), // gcc no non-const ref init from brace init list + doc_ids{p_doc_ids}, + term_ids{p_term_ids} + { + /* nothing */ + } +}; + class dirichlet_prior_opt : public dirichlet_prior{ public: template @@ -88,34 +114,121 @@ class dirichlet_prior_opt : public dirichlet_prior{ } float get_optimized_mu(const inverted_index& idx) { - optimize(idx); + optimize_mu(idx); + return mu_; } private: - void optimize(const inverted_index& idx) { - // TODO: parse idx + void optimize_mu(const inverted_index& idx) { auto docs_ids = idx.docs(); auto terms_ids = idx.terms(); + docs_data dd{idx, docs_ids, terms_ids}; - std::cout << idx.unique_terms() << std::endl; + optimize_mu(dd); +// std::cout << idx.unique_terms() << std::endl; - for (auto d_id: docs_ids){ - for (auto t_id: terms_ids){ - std::cout << idx.term_freq(t_id, d_id) << std::endl; - } - } +// for (auto d_id: docs_ids){ +// for (auto t_id: terms_ids){ +// std::cout << idx.term_freq(t_id, d_id) << std::endl; +// } +// } +// optimize_mu(std::vector docs_ids, // idx.unique_terms() // idx.total_corpus_terms() } - virtual void optimize_mu(const inverted_index& idx) = 0; + virtual void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) = 0; }; +// # TODO: choose template type instead of long +typedef long count_d; + class digamma_rec: public dirichlet_prior_opt{ - void optimize_mu(const inverted_index& idx) override { mu_ = 0;}; + void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) override { + // fill C_.(n) and C_k(n) + + std::map docs_counts; + std::map> terms_docs_counts; + long doc_size, doc_term_freq; + + for (auto d_id: dd.doc_ids){ + doc_size = dd.idx.doc_size(d_id); + + //// increase number of docs with the given size 
(C_.(n)) + docs_counts[doc_size] += 1; + + for (auto t_id: dd.idx.terms(d_id)){ + doc_term_freq = dd.idx.term_freq(t_id, d_id); + + //// increase number of docs with the given count of word t_id (C_k(n)) + terms_docs_counts[t_id][doc_term_freq] += 1; + } + } + +// // sort by ascending of occurences +// std::sort(docs_counts.begin(), items.end()); +// for (auto key: terms_docs_counts){ +// std::sort(key.second.begin(), key.second.end()); +// } + + // p(w|REF) = dd.idx.total_num_occurences(t_id) + + // fill start vector alpha_m + double alpha = 1; + std::map alpha_m; + + for (auto t_id: dd.idx.terms()){ + alpha_m[t_id] = dd.idx.total_num_occurences(t_id) * alpha; + } + + double D, S; + bool converged = false; + + while (!converged){ + D = 0; + S = 0; + + alpha = 0; + for (auto alpha_m_k: alpha_m){ + alpha += alpha_m_k; + } + + count_d n, c_d; + for (auto kv: docs_counts){ + n = kv.first; + c_d = kv.second; + + D += 1/(n - 1 + alpha); + S += c_d * D; + } + + std::map c_n; + term_id k; + double S_k; + for (auto kv: terms_docs_counts){ + k = kv.first; + c_n = kv.second; + + D = 0; + S_k = 0; + + count_d n, c_k_n; + for (auto kv_: c_n){ + n = kv_.first; + c_k_n = kv_.second; + + D += 1/(n - 1 + alpha * m_k); + S_k += c_k_n * D; + } + + alpha_m[k] *= S_k / S; + } + } + + } }; class log_approx: public dirichlet_prior_opt{ From 4528ec667f59e9900fd64a0b283357be86e05806 Mon Sep 17 00:00:00 2001 From: Alex2304 Date: Thu, 30 Nov 2017 04:43:19 +0300 Subject: [PATCH 17/30] [opt] *first method builds --- include/meta/index/ranker/dirichlet_prior.h | 46 +++++++++++++++------ src/index/ranker/test_opt/test.cpp | 28 ++++++------- 2 files changed, 47 insertions(+), 27 deletions(-) diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h index a11a0a2fe..702d28638 100644 --- a/include/meta/index/ranker/dirichlet_prior.h +++ b/include/meta/index/ranker/dirichlet_prior.h @@ -12,6 +12,7 @@ #include "meta/index/ranker/lm_ranker.h" #include 
"meta/index/ranker/ranker_factory.h" +#include #include namespace meta @@ -78,7 +79,7 @@ struct docs_data { // general info - inverted_index& idx; + const inverted_index& idx; /// ids of all documents std::vector doc_ids; /// ids of all terms @@ -90,8 +91,7 @@ struct docs_data * @param p_doc_ids ids of all docs * @param p_term_ids ids of all terms */ - score_data(inverted_index& p_idx, std::vector p_doc_ids, std::vector p_term_ids, - uint64_t p_total_terms, float p_query_length) + docs_data(const inverted_index& p_idx, std::vector p_doc_ids, std::vector p_term_ids) : idx(p_idx), // gcc no non-const ref init from brace init list doc_ids{p_doc_ids}, term_ids{p_term_ids} @@ -119,11 +119,22 @@ class dirichlet_prior_opt : public dirichlet_prior{ return mu_; } +protected: + inline double get_alpha(std::map alpha_m){ + double alpha = 0; + + for (auto alpha_m_k: alpha_m){ + alpha += alpha_m_k.second; + } + + return alpha; + } + private: void optimize_mu(const inverted_index& idx) { auto docs_ids = idx.docs(); auto terms_ids = idx.terms(); - docs_data dd{idx, docs_ids, terms_ids}; + docs_data dd{idx, docs_ids, terms_ids}; optimize_mu(dd); // std::cout << idx.unique_terms() << std::endl; @@ -177,7 +188,7 @@ class digamma_rec: public dirichlet_prior_opt{ // p(w|REF) = dd.idx.total_num_occurences(t_id) // fill start vector alpha_m - double alpha = 1; + double alpha = 1, alpha_mk_new; std::map alpha_m; for (auto t_id: dd.idx.terms()){ @@ -185,16 +196,15 @@ class digamma_rec: public dirichlet_prior_opt{ } double D, S; - bool converged = false; + bool all_optimized = false; + int iters_num = 0; - while (!converged){ + while (!all_optimized && iters_num < max_iter){ D = 0; S = 0; + all_optimized = true; - alpha = 0; - for (auto alpha_m_k: alpha_m){ - alpha += alpha_m_k; - } + alpha = get_alpha(alpha_m); count_d n, c_d; for (auto kv: docs_counts){ @@ -220,14 +230,24 @@ class digamma_rec: public dirichlet_prior_opt{ n = kv_.first; c_k_n = kv_.second; - D += 1/(n - 1 + alpha * m_k); 
+ D += 1/(n - 1 + alpha_m[k]); S_k += c_k_n * D; } - alpha_m[k] *= S_k / S; + alpha_mk_new = alpha_m[k] * S_k / S; + + if (std::abs(alpha_mk_new - alpha_m[k]) > eps){ + all_optimized = false; + } + + alpha_m[k] *= alpha_mk_new; } + + iters_num++; } + mu_ = get_alpha(alpha_m); + } }; diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp index b85fdbfca..756ae45cf 100644 --- a/src/index/ranker/test_opt/test.cpp +++ b/src/index/ranker/test_opt/test.cpp @@ -45,22 +45,22 @@ int main(int argc, char* argv[]) std::cout << "Method DR took: " << time.count() / 1000.0 << " seconds" << std::endl; - time = common::time([&]() - { - // Create and make score of optimizer - index::log_approx ranker; - std::cout << ranker.get_optimized_mu(*idx) << std::endl; - }); +// time = common::time([&]() +// { +// // Create and make score of optimizer +// index::log_approx ranker; +// std::cout << ranker.get_optimized_mu(*idx) << std::endl; +// }); - std::cout << "Method LA took: " << time.count() / 1000.0 - << " seconds" << std::endl; +// std::cout << "Method LA took: " << time.count() / 1000.0 +// << " seconds" << std::endl; - time = common::time([&]() - { - // Create and make score of optimizer - index::mackay_peto ranker; - std::cout << ranker.get_optimized_mu(*idx) << std::endl; - }); +// time = common::time([&]() +// { +// // Create and make score of optimizer +// index::mackay_peto ranker; +// std::cout << ranker.get_optimized_mu(*idx) << std::endl; +// }); std::cout << "Method MP took: " << time.count() / 1000.0 << " seconds" << std::endl; From 312a485f45c1d64012e4de3ed8f401b6551d9199 Mon Sep 17 00:00:00 2001 From: Alex2304 Date: Thu, 30 Nov 2017 05:27:03 +0300 Subject: [PATCH 18/30] [opt] * method works --- include/meta/index/ranker/dirichlet_prior.h | 52 ++++++++++++++------- src/index/ranker/test_opt/test.cpp | 4 +- 2 files changed, 38 insertions(+), 18 deletions(-) diff --git a/include/meta/index/ranker/dirichlet_prior.h 
b/include/meta/index/ranker/dirichlet_prior.h index 702d28638..1f914950f 100644 --- a/include/meta/index/ranker/dirichlet_prior.h +++ b/include/meta/index/ranker/dirichlet_prior.h @@ -15,6 +15,8 @@ #include #include +using namespace std; + namespace meta { namespace index @@ -75,6 +77,10 @@ class dirichlet_prior : public language_model_ranker float mu_; }; + +// # TODO: choose template type instead of long +typedef long count_d; + struct docs_data { // general info @@ -84,6 +90,8 @@ struct docs_data std::vector doc_ids; /// ids of all terms std::vector term_ids; + /// total size of documents + count_d ref_size; /** * Constructor to initialize most elements. @@ -91,10 +99,11 @@ struct docs_data * @param p_doc_ids ids of all docs * @param p_term_ids ids of all terms */ - docs_data(const inverted_index& p_idx, std::vector p_doc_ids, std::vector p_term_ids) + docs_data(const inverted_index& p_idx, std::vector p_doc_ids, std::vector p_term_ids, count_d p_ref_size) : idx(p_idx), // gcc no non-const ref init from brace init list doc_ids{p_doc_ids}, - term_ids{p_term_ids} + term_ids{p_term_ids}, + ref_size{p_ref_size} { /* nothing */ } @@ -113,8 +122,8 @@ class dirichlet_prior_opt : public dirichlet_prior{ return ranker::score(idx, begin, end, num_results); } - float get_optimized_mu(const inverted_index& idx) { - optimize_mu(idx); + float get_optimized_mu(const inverted_index& idx, float eps=1e-6, int max_iter=100) { + optimize_mu(idx, eps, max_iter); return mu_; } @@ -131,12 +140,17 @@ class dirichlet_prior_opt : public dirichlet_prior{ } private: - void optimize_mu(const inverted_index& idx) { + void optimize_mu(const inverted_index& idx, float eps=1e-6, int max_iter=100) { auto docs_ids = idx.docs(); auto terms_ids = idx.terms(); - docs_data dd{idx, docs_ids, terms_ids}; - optimize_mu(dd); + count_d ref_size = 0; + for (auto& id : docs_ids) + ref_size += idx.doc_size(id); + + docs_data dd{idx, docs_ids, terms_ids, ref_size}; + + optimize_mu(dd, eps, max_iter); // 
std::cout << idx.unique_terms() << std::endl; // for (auto d_id: docs_ids){ @@ -154,10 +168,7 @@ class dirichlet_prior_opt : public dirichlet_prior{ virtual void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) = 0; }; -// # TODO: choose template type instead of long -typedef long count_d; - -class digamma_rec: public dirichlet_prior_opt{ +class digamma_rec_opt: public dirichlet_prior_opt{ void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) override { // fill C_.(n) and C_k(n) @@ -191,21 +202,25 @@ class digamma_rec: public dirichlet_prior_opt{ double alpha = 1, alpha_mk_new; std::map alpha_m; + cout << "Start alpha: "; for (auto t_id: dd.idx.terms()){ alpha_m[t_id] = dd.idx.total_num_occurences(t_id) * alpha; + alpha_m[t_id] /= (double)dd.ref_size; + cout << alpha_m[t_id] << " "; } double D, S; bool all_optimized = false; - int iters_num = 0; + int iter_num = 0; - while (!all_optimized && iters_num < max_iter){ + while (!all_optimized && iter_num < max_iter){ D = 0; S = 0; all_optimized = true; alpha = get_alpha(alpha_m); + cout << "\nIter " << iter_num << " alpha = " << alpha; count_d n, c_d; for (auto kv: docs_counts){ n = kv.first; @@ -243,7 +258,12 @@ class digamma_rec: public dirichlet_prior_opt{ alpha_m[k] *= alpha_mk_new; } - iters_num++; + cout << "\nVector alpha_m after the iter: "; + for (auto kv: alpha_m){ + cout << " " << kv.second; + } + + iter_num++; } mu_ = get_alpha(alpha_m); @@ -251,11 +271,11 @@ class digamma_rec: public dirichlet_prior_opt{ } }; -class log_approx: public dirichlet_prior_opt{ +class log_approx_opt: public dirichlet_prior_opt{ // void optimize_mu(const inverted_index& idx) override { mu_ = 0;}; }; -class mackay_peto: public dirichlet_prior_opt{ +class mackay_peto_opt: public dirichlet_prior_opt{ // void optimize_mu(const inverted_index& idx) override { mu_ = 0;}; }; diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp index 756ae45cf..ff2afc130 100644 --- 
a/src/index/ranker/test_opt/test.cpp +++ b/src/index/ranker/test_opt/test.cpp @@ -38,8 +38,8 @@ int main(int argc, char* argv[]) auto time = common::time([&]() { // Create and make score of optimizer - index::digamma_rec ranker; - std::cout << ranker.get_optimized_mu(*idx) << std::endl; + index::digamma_rec_opt ranker; + std::cout << ranker.get_optimized_mu(*idx, 1e-6, 100) << std::endl; }); std::cout << "Method DR took: " << time.count() / 1000.0 From b60cc54a7549120280ad63855d17136fc1003d32 Mon Sep 17 00:00:00 2001 From: Alex2304 Date: Thu, 30 Nov 2017 18:20:00 +0300 Subject: [PATCH 19/30] [opt] *first method debugged --- include/meta/index/ranker/dirichlet_prior.h | 81 ++++++++++++++------- src/index/ranker/test_opt/test.cpp | 4 +- 2 files changed, 58 insertions(+), 27 deletions(-) diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h index 1f914950f..1133ad941 100644 --- a/include/meta/index/ranker/dirichlet_prior.h +++ b/include/meta/index/ranker/dirichlet_prior.h @@ -168,7 +168,7 @@ class dirichlet_prior_opt : public dirichlet_prior{ virtual void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) = 0; }; -class digamma_rec_opt: public dirichlet_prior_opt{ +class dirichlet_digamma_rec: public dirichlet_prior_opt{ void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) override { // fill C_.(n) and C_k(n) @@ -176,20 +176,45 @@ class digamma_rec_opt: public dirichlet_prior_opt{ std::map> terms_docs_counts; long doc_size, doc_term_freq; + cout << "Docs and terms:\n"; for (auto d_id: dd.doc_ids){ doc_size = dd.idx.doc_size(d_id); //// increase number of docs with the given size (C_.(n)) docs_counts[doc_size] += 1; - for (auto t_id: dd.idx.terms(d_id)){ + cout << d_id << " " << doc_size << " " << docs_counts[doc_size] << endl; + for (auto t_id: dd.term_ids){ doc_term_freq = dd.idx.term_freq(t_id, d_id); //// increase number of docs with the given count of word t_id (C_k(n)) 
terms_docs_counts[t_id][doc_term_freq] += 1; + + cout << " " << t_id << " " << doc_term_freq << " " << terms_docs_counts[t_id][doc_term_freq] << endl; } } + cout << "\nDocuments_ids count: " << dd.doc_ids.size() << "; Terms ids count: " << dd.term_ids.size() << endl; + + cout << "\nDocuments sizes frequency:\n"; + for (auto kv: docs_counts){ + cout << kv.first << " " << kv.second << endl; + } + + int occur_sum, freq_sum; + cout << "\nTerms frequency in each doc:\n"; + for (auto kv: terms_docs_counts){ + occur_sum = 0; + freq_sum = 0; + cout << dd.idx.total_num_occurences(kv.first) << " " << kv.first << endl; + for (auto kv_: kv.second){ + occur_sum += kv_.second; + freq_sum += kv_.first * kv_.second; + cout << " " << kv_.first << " " << kv_.second << endl; + } + cout << " " << freq_sum << " " << occur_sum << " total occurences" << endl; + } + // // sort by ascending of occurences // std::sort(docs_counts.begin(), items.end()); // for (auto key: terms_docs_counts){ @@ -199,11 +224,11 @@ class digamma_rec_opt: public dirichlet_prior_opt{ // p(w|REF) = dd.idx.total_num_occurences(t_id) // fill start vector alpha_m - double alpha = 1, alpha_mk_new; + double alpha = 2000.0, alpha_mk_new; std::map alpha_m; - cout << "Start alpha: "; - for (auto t_id: dd.idx.terms()){ + cout << "\nStart alpha: "; + for (auto t_id: dd.term_ids){ alpha_m[t_id] = dd.idx.total_num_occurences(t_id) * alpha; alpha_m[t_id] /= (double)dd.ref_size; cout << alpha_m[t_id] << " "; @@ -213,49 +238,55 @@ class digamma_rec_opt: public dirichlet_prior_opt{ bool all_optimized = false; int iter_num = 0; + double n_max = docs_counts.rbegin()->first; + cout << "\n n_max=" << n_max << endl; + while (!all_optimized && iter_num < max_iter){ - D = 0; - S = 0; + D = 0.0; + S = 0.0; all_optimized = true; alpha = get_alpha(alpha_m); cout << "\nIter " << iter_num << " alpha = " << alpha; - count_d n, c_d; - for (auto kv: docs_counts){ - n = kv.first; - c_d = kv.second; + count_d c_d; + for (count_d n = 1; n <= 
n_max; n++){ + c_d = docs_counts[n]; - D += 1/(n - 1 + alpha); + D += 1.0/(n - 1 + alpha); S += c_d * D; } - std::map c_n; + cout << "\nD = " << D << "; S = " << S << "; S_k = "; + + std::map c_k; term_id k; double S_k; for (auto kv: terms_docs_counts){ k = kv.first; - c_n = kv.second; + c_k = kv.second; - D = 0; - S_k = 0; + D = 0.0; + S_k = 0.0; - count_d n, c_k_n; - for (auto kv_: c_n){ - n = kv_.first; - c_k_n = kv_.second; + count_d c_k_n, n_k_max = c_k.rbegin()->first; + cout << "\n n_k_max=" << n_k_max << endl; + for (count_d n = 1; n <= n_k_max; n++){ + c_k_n = c_k[n]; - D += 1/(n - 1 + alpha_m[k]); + D += 1.0/(n - 1 + alpha_m[k]); S_k += c_k_n * D; } + cout << S_k << " "; + alpha_mk_new = alpha_m[k] * S_k / S; if (std::abs(alpha_mk_new - alpha_m[k]) > eps){ all_optimized = false; } - alpha_m[k] *= alpha_mk_new; + alpha_m[k] = alpha_mk_new; } cout << "\nVector alpha_m after the iter: "; @@ -267,15 +298,15 @@ class digamma_rec_opt: public dirichlet_prior_opt{ } mu_ = get_alpha(alpha_m); - + cout << endl << mu_ << endl; } }; -class log_approx_opt: public dirichlet_prior_opt{ +class dirichlet_log_approx: public dirichlet_prior_opt{ // void optimize_mu(const inverted_index& idx) override { mu_ = 0;}; }; -class mackay_peto_opt: public dirichlet_prior_opt{ +class dirichlet_mackay_peto: public dirichlet_prior_opt{ // void optimize_mu(const inverted_index& idx) override { mu_ = 0;}; }; diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp index ff2afc130..bd69cfba7 100644 --- a/src/index/ranker/test_opt/test.cpp +++ b/src/index/ranker/test_opt/test.cpp @@ -38,8 +38,8 @@ int main(int argc, char* argv[]) auto time = common::time([&]() { // Create and make score of optimizer - index::digamma_rec_opt ranker; - std::cout << ranker.get_optimized_mu(*idx, 1e-6, 100) << std::endl; + index::dirichlet_digamma_rec ranker; + std::cout << "\n\n" << ranker.get_optimized_mu(*idx, 1e-6, 100) << std::endl; }); std::cout << "Method DR took: " << 
time.count() / 1000.0 From 0a0851cc524926b83d68583068b176f4e463894b Mon Sep 17 00:00:00 2001 From: Alex2304 Date: Thu, 30 Nov 2017 18:44:29 +0300 Subject: [PATCH 20/30] [opt] method refactored --- include/meta/index/ranker/dirichlet_prior.h | 136 +++++++------------- src/index/ranker/test_opt/test.cpp | 2 +- 2 files changed, 46 insertions(+), 92 deletions(-) diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h index 1133ad941..ef04fc681 100644 --- a/include/meta/index/ranker/dirichlet_prior.h +++ b/include/meta/index/ranker/dirichlet_prior.h @@ -13,9 +13,6 @@ #include "meta/index/ranker/ranker_factory.h" #include -#include - -using namespace std; namespace meta { @@ -92,6 +89,12 @@ struct docs_data std::vector term_ids; /// total size of documents count_d ref_size; + /// C_.(n) + std::map docs_counts; + /// C_k(n) + std::map> terms_docs_counts; + /// vector alpha_m + std::map alpha_m; /** * Constructor to initialize most elements. @@ -99,11 +102,16 @@ struct docs_data * @param p_doc_ids ids of all docs * @param p_term_ids ids of all terms */ - docs_data(const inverted_index& p_idx, std::vector p_doc_ids, std::vector p_term_ids, count_d p_ref_size) + docs_data(const inverted_index& p_idx, std::vector p_doc_ids, std::vector p_term_ids, count_d p_ref_size, + std::map p_docs_counts, std::map> p_terms_docs_counts, + std::map p_alpha_m) : idx(p_idx), // gcc no non-const ref init from brace init list doc_ids{p_doc_ids}, term_ids{p_term_ids}, - ref_size{p_ref_size} + ref_size{p_ref_size}, + docs_counts{p_docs_counts}, + terms_docs_counts{p_terms_docs_counts}, + alpha_m{p_alpha_m} { /* nothing */ } @@ -122,7 +130,7 @@ class dirichlet_prior_opt : public dirichlet_prior{ return ranker::score(idx, begin, end, num_results); } - float get_optimized_mu(const inverted_index& idx, float eps=1e-6, int max_iter=100) { + float get_optimized_mu(const inverted_index& idx, float eps, int max_iter) { optimize_mu(idx, eps, max_iter); return 
mu_; @@ -140,106 +148,64 @@ class dirichlet_prior_opt : public dirichlet_prior{ } private: - void optimize_mu(const inverted_index& idx, float eps=1e-6, int max_iter=100) { + void optimize_mu(const inverted_index& idx, float eps=1e-6, int max_iter=10000) { + // parse idx and extract what we need auto docs_ids = idx.docs(); auto terms_ids = idx.terms(); + // calculate ref_size count_d ref_size = 0; for (auto& id : docs_ids) ref_size += idx.doc_size(id); - docs_data dd{idx, docs_ids, terms_ids, ref_size}; - - optimize_mu(dd, eps, max_iter); -// std::cout << idx.unique_terms() << std::endl; - -// for (auto d_id: docs_ids){ -// for (auto t_id: terms_ids){ -// std::cout << idx.term_freq(t_id, d_id) << std::endl; -// } -// } - -// optimize_mu(std::vector docs_ids, -// idx.unique_terms() -// idx.total_corpus_terms() - - } - - virtual void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) = 0; -}; - -class dirichlet_digamma_rec: public dirichlet_prior_opt{ - void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) override { - // fill C_.(n) and C_k(n) - + // calculate C_.(n) and C_k(n) std::map docs_counts; std::map> terms_docs_counts; - long doc_size, doc_term_freq; - cout << "Docs and terms:\n"; - for (auto d_id: dd.doc_ids){ - doc_size = dd.idx.doc_size(d_id); + long doc_size, doc_term_freq; + for (auto d_id: docs_ids){ + doc_size = idx.doc_size(d_id); //// increase number of docs with the given size (C_.(n)) docs_counts[doc_size] += 1; - cout << d_id << " " << doc_size << " " << docs_counts[doc_size] << endl; - for (auto t_id: dd.term_ids){ - doc_term_freq = dd.idx.term_freq(t_id, d_id); + for (auto t_id: terms_ids){ + doc_term_freq = idx.term_freq(t_id, d_id); //// increase number of docs with the given count of word t_id (C_k(n)) terms_docs_counts[t_id][doc_term_freq] += 1; - - cout << " " << t_id << " " << doc_term_freq << " " << terms_docs_counts[t_id][doc_term_freq] << endl; } } - cout << "\nDocuments_ids count: " << 
dd.doc_ids.size() << "; Terms ids count: " << dd.term_ids.size() << endl; - - cout << "\nDocuments sizes frequency:\n"; - for (auto kv: docs_counts){ - cout << kv.first << " " << kv.second << endl; - } + // fill start vector alpha_m + std::map alpha_m; - int occur_sum, freq_sum; - cout << "\nTerms frequency in each doc:\n"; - for (auto kv: terms_docs_counts){ - occur_sum = 0; - freq_sum = 0; - cout << dd.idx.total_num_occurences(kv.first) << " " << kv.first << endl; - for (auto kv_: kv.second){ - occur_sum += kv_.second; - freq_sum += kv_.first * kv_.second; - cout << " " << kv_.first << " " << kv_.second << endl; - } - cout << " " << freq_sum << " " << occur_sum << " total occurences" << endl; + for (auto t_id: terms_ids){ + alpha_m[t_id] = idx.total_num_occurences(t_id) * default_mu; + alpha_m[t_id] /= (double)ref_size; } -// // sort by ascending of occurences -// std::sort(docs_counts.begin(), items.end()); -// for (auto key: terms_docs_counts){ -// std::sort(key.second.begin(), key.second.end()); -// } + // create docs_data + docs_data dd{idx, docs_ids, terms_ids, ref_size, docs_counts, terms_docs_counts, alpha_m}; - // p(w|REF) = dd.idx.total_num_occurences(t_id) + // call optimizer + optimize_mu(dd, eps, max_iter); + } - // fill start vector alpha_m - double alpha = 2000.0, alpha_mk_new; - std::map alpha_m; + virtual void optimize_mu(docs_data& dd, float eps, int max_iter) = 0; +}; - cout << "\nStart alpha: "; - for (auto t_id: dd.term_ids){ - alpha_m[t_id] = dd.idx.total_num_occurences(t_id) * alpha; - alpha_m[t_id] /= (double)dd.ref_size; - cout << alpha_m[t_id] << " "; - } +class dirichlet_digamma_rec: public dirichlet_prior_opt{ + void optimize_mu(docs_data& dd, float eps, int max_iter) override { - double D, S; bool all_optimized = false; int iter_num = 0; + double D, S; + double n_max = dd.docs_counts.rbegin()->first; - double n_max = docs_counts.rbegin()->first; - cout << "\n n_max=" << n_max << endl; + // start values for alpha and alpha_m + double 
alpha = default_mu, alpha_mk_new; + std::map alpha_m = dd.alpha_m; while (!all_optimized && iter_num < max_iter){ D = 0.0; @@ -248,21 +214,18 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{ alpha = get_alpha(alpha_m); - cout << "\nIter " << iter_num << " alpha = " << alpha; count_d c_d; for (count_d n = 1; n <= n_max; n++){ - c_d = docs_counts[n]; + c_d = dd.docs_counts[n]; D += 1.0/(n - 1 + alpha); S += c_d * D; } - cout << "\nD = " << D << "; S = " << S << "; S_k = "; - - std::map c_k; term_id k; + std::map c_k; double S_k; - for (auto kv: terms_docs_counts){ + for (auto kv: dd.terms_docs_counts){ k = kv.first; c_k = kv.second; @@ -270,7 +233,6 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{ S_k = 0.0; count_d c_k_n, n_k_max = c_k.rbegin()->first; - cout << "\n n_k_max=" << n_k_max << endl; for (count_d n = 1; n <= n_k_max; n++){ c_k_n = c_k[n]; @@ -278,8 +240,6 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{ S_k += c_k_n * D; } - cout << S_k << " "; - alpha_mk_new = alpha_m[k] * S_k / S; if (std::abs(alpha_mk_new - alpha_m[k]) > eps){ @@ -289,16 +249,10 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{ alpha_m[k] = alpha_mk_new; } - cout << "\nVector alpha_m after the iter: "; - for (auto kv: alpha_m){ - cout << " " << kv.second; - } - iter_num++; } mu_ = get_alpha(alpha_m); - cout << endl << mu_ << endl; } }; diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp index bd69cfba7..39f80c198 100644 --- a/src/index/ranker/test_opt/test.cpp +++ b/src/index/ranker/test_opt/test.cpp @@ -39,7 +39,7 @@ int main(int argc, char* argv[]) { // Create and make score of optimizer index::dirichlet_digamma_rec ranker; - std::cout << "\n\n" << ranker.get_optimized_mu(*idx, 1e-6, 100) << std::endl; + std::cout << "\n\n" << ranker.get_optimized_mu(*idx, 1e-6, 10000) << std::endl; }); std::cout << "Method DR took: " << time.count() / 1000.0 From d726f708b16985a47a2b2e5248f9f77dcd464a30 Mon Sep 17 00:00:00 
2001 From: Alex2304 Date: Thu, 30 Nov 2017 19:05:46 +0300 Subject: [PATCH 21/30] [opt] + method2 --- include/meta/index/ranker/dirichlet_prior.h | 60 ++++++++++++++++++++- src/index/ranker/test_opt/test.cpp | 29 +++++----- 2 files changed, 74 insertions(+), 15 deletions(-) diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h index ef04fc681..6b99671d8 100644 --- a/include/meta/index/ranker/dirichlet_prior.h +++ b/include/meta/index/ranker/dirichlet_prior.h @@ -197,7 +197,6 @@ class dirichlet_prior_opt : public dirichlet_prior{ class dirichlet_digamma_rec: public dirichlet_prior_opt{ void optimize_mu(docs_data& dd, float eps, int max_iter) override { - bool all_optimized = false; int iter_num = 0; double D, S; @@ -257,7 +256,64 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{ }; class dirichlet_log_approx: public dirichlet_prior_opt{ -// void optimize_mu(const inverted_index& idx) override { mu_ = 0;}; + void optimize_mu(docs_data& dd, float eps, int max_iter) override { + bool all_optimized = false; + int iter_num = 0; + double S, S_k; + double n_max = dd.docs_counts.rbegin()->first; + + // start values for alpha and alpha_m + double alpha = default_mu, alpha_mk_new; + std::map alpha_m = dd.alpha_m; + + while (!all_optimized && iter_num < max_iter){ + S = 0.0; + all_optimized = true; + + alpha = get_alpha(alpha_m); + + count_d c_d; + // TODO: skip the zero docs counts + for (count_d n = 1; n <= n_max; n++){ + c_d = dd.docs_counts[n]; + + if (c_d != 0){ + S += c_d * (1.0/alpha + log(n + alpha - 0.5) - log(alpha + 0.5)); + } + } + + term_id k; + std::map c_k; + for (auto kv: dd.terms_docs_counts){ + k = kv.first; + c_k = kv.second; + + S_k = 0.0; + + count_d c_k_n, n_k_max = c_k.rbegin()->first; + // TODO: skip the zero docs counts + for (count_d n = 1; n <= n_k_max; n++){ + c_k_n = c_k[n]; + + if (c_k_n != 0){ + S_k += c_k_n * (1.0/alpha_m[k] + log(n + alpha_m[k] - 0.5) - log(alpha_m[k] + 0.5)); + } + } + 
+ alpha_mk_new = alpha_m[k] * S_k / S; + + if (std::abs(alpha_mk_new - alpha_m[k]) > eps){ + all_optimized = false; + } + + alpha_m[k] = alpha_mk_new; + } + + iter_num++; + } + + mu_ = get_alpha(alpha_m); + } }; class dirichlet_mackay_peto: public dirichlet_prior_opt{ diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp index 39f80c198..5c5beb624 100644 --- a/src/index/ranker/test_opt/test.cpp +++ b/src/index/ranker/test_opt/test.cpp @@ -33,27 +33,30 @@ int main(int argc, char* argv[]) auto config = cpptoml::parse_file(argv[1]); auto idx = index::make_index(*config); + double eps = 1e-6; + int iters = 10000; + // Time how long it takes to create the index. By default, common::time's // unit of measurement is milliseconds. auto time = common::time([&]() { // Create and make score of optimizer index::dirichlet_digamma_rec ranker; - std::cout << "\n\n" << ranker.get_optimized_mu(*idx, 1e-6, 10000) << std::endl; + std::cout << "\n\n" << ranker.get_optimized_mu(*idx, eps, iters) << std::endl; }); - std::cout << "Method DR took: " << time.count() / 1000.0 - << " seconds" << std::endl; + std::cout << "Method DR took: " << time.count() / 1.0 + << " milliseconds" << std::endl; -// time = common::time([&]() -// { -// // Create and make score of optimizer -// index::log_approx ranker; -// std::cout << ranker.get_optimized_mu(*idx) << std::endl; -// }); + time = common::time([&]() + { + // Create and make score of optimizer + index::dirichlet_log_approx ranker; + std::cout << ranker.get_optimized_mu(*idx, eps, iters) << std::endl; + }); -// std::cout << "Method LA took: " << time.count() / 1000.0 -// << " seconds" << std::endl; + std::cout << "Method LA took: " << time.count() / 1.0 + << " milliseconds" << std::endl; // time = common::time([&]() // { @@ -62,8 +65,8 @@ int main(int argc, char* argv[]) // std::cout << ranker.get_optimized_mu(*idx) << std::endl; // }); - std::cout << "Method MP took: " << time.count() / 1000.0 - << " seconds" << 
std::endl; + std::cout << "Method MP took: " << time.count() / 1.0 + << " milliseconds" << std::endl; return 0; } From 4a6a240e55063abc895453d5c67a30f0b430184a Mon Sep 17 00:00:00 2001 From: M Date: Thu, 30 Nov 2017 19:10:48 +0300 Subject: [PATCH 22/30] Adding constructors and register for new ranker classes --- include/meta/index/ranker/dirichlet_prior.h | 57 ++++++++++++- src/index/ranker/dirichlet_prior.cpp | 90 +++++++++++++++++++++ 2 files changed, 145 insertions(+), 2 deletions(-) diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h index 1133ad941..57e14841a 100644 --- a/include/meta/index/ranker/dirichlet_prior.h +++ b/include/meta/index/ranker/dirichlet_prior.h @@ -111,6 +111,11 @@ struct docs_data class dirichlet_prior_opt : public dirichlet_prior{ public: + + dirichlet_prior_opt(float mu) : dirichlet_prior(mu) { } + + dirichlet_prior_opt(std::istream& in) : dirichlet_prior(in) { } + template std::vector score(inverted_index& idx, ForwardIterator begin, ForwardIterator end, @@ -169,6 +174,22 @@ class dirichlet_prior_opt : public dirichlet_prior{ }; class dirichlet_digamma_rec: public dirichlet_prior_opt{ +public: + const static util::string_view id; + + /** + * @param mu + */ + dirichlet_digamma_rec(float mu = default_mu); + + /** + * Loads a dirichlet_prior ranker from a stream. 
+ * @param in The stream to read from + */ + dirichlet_digamma_rec(std::istream& in); + + void save(std::ostream& out) const override; +private: void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) override { // fill C_.(n) and C_k(n) @@ -303,11 +324,43 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{ }; class dirichlet_log_approx: public dirichlet_prior_opt{ -// void optimize_mu(const inverted_index& idx) override { mu_ = 0;}; +public: + const static util::string_view id; + + /** + * @param mu + */ + dirichlet_log_approx(float mu = default_mu); + + /** + * Loads a dirichlet_prior ranker from a stream. + * @param in The stream to read from + */ + dirichlet_log_approx(std::istream& in); + + void save(std::ostream& out) const override; +private: + void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) override { mu_ = 0;}; }; class dirichlet_mackay_peto: public dirichlet_prior_opt{ -// void optimize_mu(const inverted_index& idx) override { mu_ = 0;}; +public: + const static util::string_view id; + + /** + * @param mu + */ + dirichlet_mackay_peto(float mu = default_mu); + + /** + * Loads a dirichlet_prior ranker from a stream. 
+ * @param in The stream to read from + */ + dirichlet_mackay_peto(std::istream& in); + + void save(std::ostream& out) const override; +private: + void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) override { mu_ = 0;}; }; /** diff --git a/src/index/ranker/dirichlet_prior.cpp b/src/index/ranker/dirichlet_prior.cpp index 07536afbe..9cf0fed10 100644 --- a/src/index/ranker/dirichlet_prior.cpp +++ b/src/index/ranker/dirichlet_prior.cpp @@ -55,5 +55,95 @@ std::unique_ptr throw ranker_exception{"dirichlet-prior mu must be >= 0"}; return make_unique(mu); } + +const util::string_view dirichlet_digamma_rec::id = "dirichlet-digamma-rec"; +template <> +std::unique_ptr + make_ranker(const cpptoml::table& config) +{ + auto mu = config.get_as("mu").value_or(dirichlet_digamma_rec::default_mu); + if (mu < 0) + throw ranker_exception{"dirichlet-digamma-rec mu must be >= 0"}; + return make_unique(mu); +} + +dirichlet_digamma_rec::dirichlet_digamma_rec(float mu) : dirichlet_prior_opt(mu) +{ + // nothing +} + +dirichlet_digamma_rec::dirichlet_digamma_rec(std::istream& in) + : dirichlet_prior_opt(in) +{ + // nothing +} + +void dirichlet_digamma_rec::save(std::ostream& out) const +{ + io::packed::write(out, id); + + io::packed::write(out, mu_); +} + +const util::string_view dirichlet_log_approx::id = "dirichlet-log-approx"; +template <> +std::unique_ptr + make_ranker(const cpptoml::table& config) +{ + auto mu = config.get_as("mu").value_or(dirichlet_log_approx::default_mu); + if (mu < 0) + throw ranker_exception{"dirichlet-log-approx mu must be >= 0"}; + return make_unique(mu); +} + + +dirichlet_log_approx::dirichlet_log_approx(float mu) : dirichlet_prior_opt(mu) +{ + // nothing +} + +dirichlet_log_approx::dirichlet_log_approx(std::istream& in) + : dirichlet_prior_opt(in) +{ + // nothing +} + +void dirichlet_log_approx::save(std::ostream& out) const +{ + io::packed::write(out, id); + + io::packed::write(out, mu_); +} + +const util::string_view 
dirichlet_mackay_peto::id = "dirichlet-mackay-peto"; +template <> +std::unique_ptr + make_ranker(const cpptoml::table& config) +{ + auto mu = config.get_as("mu").value_or(dirichlet_mackay_peto::default_mu); + if (mu < 0) + throw ranker_exception{"dirichlet-mackay-peto mu must be >= 0"}; + return make_unique(mu); +} + + +dirichlet_mackay_peto::dirichlet_mackay_peto(float mu) : dirichlet_prior_opt(mu) +{ + // nothing +} + +dirichlet_mackay_peto::dirichlet_mackay_peto(std::istream& in) + : dirichlet_prior_opt(in) +{ + // nothing +} + +void dirichlet_mackay_peto::save(std::ostream& out) const +{ + io::packed::write(out, id); + + io::packed::write(out, mu_); +} + } } From bc948ce7133d9a06738e7699b342506d2119e4c3 Mon Sep 17 00:00:00 2001 From: M Date: Thu, 30 Nov 2017 20:51:37 +0300 Subject: [PATCH 23/30] Add rankers to factory --- include/meta/index/ranker/dirichlet_prior.h | 21 +++++++++++++++++++++ src/index/ranker/ranker_factory.cpp | 3 +++ 2 files changed, 24 insertions(+) diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h index bd6dac611..2f0e33901 100644 --- a/include/meta/index/ranker/dirichlet_prior.h +++ b/include/meta/index/ranker/dirichlet_prior.h @@ -384,6 +384,27 @@ class dirichlet_mackay_peto: public dirichlet_prior_opt{ */ template <> std::unique_ptr make_ranker(const cpptoml::table&); + +/** + * Specialization of the factory method used to create dirichlet_digamma_rec + * rankers. + */ +template <> +std::unique_ptr make_ranker(const cpptoml::table&); + +/** + * Specialization of the factory method used to create dirichlet_log_approx + * rankers. + */ +template <> +std::unique_ptr make_ranker(const cpptoml::table&); + +/** + * Specialization of the factory method used to create dirichlet_mackay_peto + * rankers. 
+ */ +template <> +std::unique_ptr make_ranker(const cpptoml::table&); } } #endif diff --git a/src/index/ranker/ranker_factory.cpp b/src/index/ranker/ranker_factory.cpp index 86c1069af..0643b0742 100644 --- a/src/index/ranker/ranker_factory.cpp +++ b/src/index/ranker/ranker_factory.cpp @@ -31,6 +31,9 @@ ranker_factory::ranker_factory() reg(); reg(); reg(); + reg(); + reg(); + reg(); } std::unique_ptr make_ranker(const cpptoml::table& config) From 78d6d5c8ae3f4170a152524438a5b5fad6ff99bf Mon Sep 17 00:00:00 2001 From: Alex2304 Date: Thu, 30 Nov 2017 20:58:58 +0300 Subject: [PATCH 24/30] [opt] + benchmark --- include/meta/index/ranker/dirichlet_prior.h | 30 ++++++++++------ src/index/ranker/test_opt/test.cpp | 39 ++++++++++++--------- 2 files changed, 42 insertions(+), 27 deletions(-) diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h index bd6dac611..8fdd52672 100644 --- a/include/meta/index/ranker/dirichlet_prior.h +++ b/include/meta/index/ranker/dirichlet_prior.h @@ -68,6 +68,10 @@ class dirichlet_prior : public language_model_ranker */ float doc_constant(const score_data& sd) const override; + float parameter() const { + return mu_; + } + protected: /// the Dirichlet prior parameter // const float mu_; @@ -135,10 +139,8 @@ class dirichlet_prior_opt : public dirichlet_prior{ return ranker::score(idx, begin, end, num_results); } - float get_optimized_mu(const inverted_index& idx, float eps, int max_iter) { - optimize_mu(idx, eps, max_iter); - - return mu_; + std::map get_optimized_mu(const inverted_index& idx, float eps, int max_iter) { + return optimize_mu(idx, eps, max_iter); } protected: @@ -153,7 +155,7 @@ class dirichlet_prior_opt : public dirichlet_prior{ } private: - void optimize_mu(const inverted_index& idx, float eps=1e-6, int max_iter=10000) { + std::map optimize_mu(const inverted_index& idx, float eps=1e-6, int max_iter=10000) { // parse idx and extract what we need auto docs_ids = idx.docs(); auto 
terms_ids = idx.terms(); @@ -194,10 +196,10 @@ class dirichlet_prior_opt : public dirichlet_prior{ docs_data dd{idx, docs_ids, terms_ids, ref_size, docs_counts, terms_docs_counts, alpha_m}; // call optimizer - optimize_mu(dd, eps, max_iter); + return optimize_mu(dd, eps, max_iter); } - virtual void optimize_mu(docs_data& dd, float eps, int max_iter) = 0; + virtual std::map optimize_mu(docs_data& dd, float eps, int max_iter) = 0; }; class dirichlet_digamma_rec: public dirichlet_prior_opt{ @@ -217,7 +219,7 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{ void save(std::ostream& out) const override; private: - void optimize_mu(docs_data& dd, float eps, int max_iter) override { + std::map optimize_mu(docs_data& dd, float eps, int max_iter) override { bool all_optimized = false; int iter_num = 0; double D, S; @@ -273,7 +275,10 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{ } mu_ = get_alpha(alpha_m); + + return alpha_m; } + }; class dirichlet_log_approx: public dirichlet_prior_opt{ @@ -293,7 +298,7 @@ class dirichlet_log_approx: public dirichlet_prior_opt{ void save(std::ostream& out) const override; private: - void optimize_mu(docs_data& dd, float eps, int max_iter) override { + std::map optimize_mu(docs_data& dd, float eps, int max_iter) override { bool all_optimized = false; int iter_num = 0; double S, S_k; @@ -350,6 +355,8 @@ class dirichlet_log_approx: public dirichlet_prior_opt{ } mu_ = get_alpha(alpha_m); + + return alpha_m; } }; @@ -370,11 +377,14 @@ class dirichlet_mackay_peto: public dirichlet_prior_opt{ void save(std::ostream& out) const override; private: - void optimize_mu(docs_data& dd, float eps, int max_iter) override { + std::map optimize_mu(docs_data& dd, float eps, int max_iter) override { eps = eps; max_iter = max_iter; eps = dd.ref_size; mu_ = 0; + std::map alpha_m; + + return alpha_m; } }; diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp index 5c5beb624..054952104 100644 --- 
a/src/index/ranker/test_opt/test.cpp +++ b/src/index/ranker/test_opt/test.cpp @@ -13,6 +13,13 @@ using namespace meta; +void display_result(float alpha, std::map alpha_m, float time){ + for (auto kv: alpha_m){ + std::cout << kv.second << " "; + } + std::cout << alpha << std::endl << time << std::endl; +} + int main(int argc, char* argv[]) { if (argc != 2) @@ -36,27 +43,28 @@ int main(int argc, char* argv[]) double eps = 1e-6; int iters = 10000; - // Time how long it takes to create the index. By default, common::time's - // unit of measurement is milliseconds. - auto time = common::time([&]() + float alpha; + std::map alpha_m; + + index::dirichlet_digamma_rec ranker1; + index::dirichlet_log_approx ranker2; + + auto time1 = common::time([&]() { - // Create and make score of optimizer - index::dirichlet_digamma_rec ranker; - std::cout << "\n\n" << ranker.get_optimized_mu(*idx, eps, iters) << std::endl; + alpha_m = ranker1.get_optimized_mu(*idx, eps, iters); + alpha = ranker1.parameter(); }); - std::cout << "Method DR took: " << time.count() / 1.0 - << " milliseconds" << std::endl; + display_result(alpha, alpha_m, time1.count() / 1.0); - time = common::time([&]() + auto time2 = common::time([&]() { - // Create and make score of optimizer - index::dirichlet_log_approx ranker; - std::cout << ranker.get_optimized_mu(*idx, eps, iters) << std::endl; + alpha_m = ranker2.get_optimized_mu(*idx, eps, iters); + alpha = ranker2.parameter(); }); - std::cout << "Method LA took: " << time.count() / 1.0 - << " milliseconds" << std::endl; + display_result(alpha, alpha_m, time2.count() / 1.0); + // time = common::time([&]() // { @@ -65,8 +73,5 @@ int main(int argc, char* argv[]) // std::cout << ranker.get_optimized_mu(*idx) << std::endl; // }); - std::cout << "Method MP took: " << time.count() / 1.0 - << " milliseconds" << std::endl; - return 0; } From 5bc6ee68e162ff4a4aae626a2fb3d9c5932b9c80 Mon Sep 17 00:00:00 2001 From: M Date: Thu, 30 Nov 2017 21:37:49 +0300 Subject: [PATCH 25/30] 
Minor fix foor output --- src/index/ranker/test_opt/test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp index 054952104..2c27f2427 100644 --- a/src/index/ranker/test_opt/test.cpp +++ b/src/index/ranker/test_opt/test.cpp @@ -17,7 +17,7 @@ void display_result(float alpha, std::map alpha_m, float time){ for (auto kv: alpha_m){ std::cout << kv.second << " "; } - std::cout << alpha << std::endl << time << std::endl; + std::cout << std::endl << alpha << std::endl << time << std::endl; } int main(int argc, char* argv[]) From 4f8fa1d59e3dc9f862a491a08b744ed9a789fd8b Mon Sep 17 00:00:00 2001 From: Alex2304 Date: Fri, 1 Dec 2017 02:21:48 +0300 Subject: [PATCH 26/30] [opt] + dirichlet_opt files --- include/meta/index/ranker/all.h | 1 + include/meta/index/ranker/dirichlet_prior.h | 331 -------------------- src/index/ranker/CMakeLists.txt | 1 + src/index/ranker/dirichlet_prior.cpp | 89 ------ 4 files changed, 2 insertions(+), 420 deletions(-) diff --git a/include/meta/index/ranker/all.h b/include/meta/index/ranker/all.h index 8a1fe0e04..3b3c1efcf 100644 --- a/include/meta/index/ranker/all.h +++ b/include/meta/index/ranker/all.h @@ -1,6 +1,7 @@ #include "meta/index/ranker/ranker.h" #include "meta/index/ranker/absolute_discount.h" #include "meta/index/ranker/dirichlet_prior.h" +#include "meta/index/ranker/dirichlet_prior_opt.h" #include "meta/index/ranker/jelinek_mercer.h" #include "meta/index/ranker/lm_ranker.h" #include "meta/index/ranker/okapi_bm25.h" diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h index cfe782b1a..6f2456f8a 100644 --- a/include/meta/index/ranker/dirichlet_prior.h +++ b/include/meta/index/ranker/dirichlet_prior.h @@ -12,8 +12,6 @@ #include "meta/index/ranker/lm_ranker.h" #include "meta/index/ranker/ranker_factory.h" -#include - namespace meta { namespace index @@ -79,315 +77,6 @@ class dirichlet_prior : public 
language_model_ranker }; -// # TODO: choose template type instead of long -typedef long count_d; - -struct docs_data -{ - // general info - - const inverted_index& idx; - /// ids of all documents - std::vector doc_ids; - /// ids of all terms - std::vector term_ids; - /// total size of documents - count_d ref_size; - /// C_.(n) - std::map docs_counts; - /// C_k(n) - std::map> terms_docs_counts; - /// vector alpha_m - std::map alpha_m; - - /** - * Constructor to initialize most elements. - * @param p_idx The index that is being used - * @param p_doc_ids ids of all docs - * @param p_term_ids ids of all terms - */ - docs_data(const inverted_index& p_idx, std::vector p_doc_ids, std::vector p_term_ids, count_d p_ref_size, - std::map p_docs_counts, std::map> p_terms_docs_counts, - std::map p_alpha_m) - : idx(p_idx), // gcc no non-const ref init from brace init list - doc_ids{p_doc_ids}, - term_ids{p_term_ids}, - ref_size{p_ref_size}, - docs_counts{p_docs_counts}, - terms_docs_counts{p_terms_docs_counts}, - alpha_m{p_alpha_m} - { - /* nothing */ - } -}; - -class dirichlet_prior_opt : public dirichlet_prior{ -public: - - dirichlet_prior_opt(float mu) : dirichlet_prior(mu) { } - - dirichlet_prior_opt(std::istream& in) : dirichlet_prior(in) { } - - template - std::vector score(inverted_index& idx, ForwardIterator begin, - ForwardIterator end, - uint64_t num_results = 10) - { - // optimize mu according to ranker_context before ranking - this->optimize_mu(idx); - - return ranker::score(idx, begin, end, num_results); - } - - std::map get_optimized_mu(const inverted_index& idx, float eps, int max_iter) { - return optimize_mu(idx, eps, max_iter); - } - -protected: - inline double get_alpha(std::map alpha_m){ - double alpha = 0; - - for (auto alpha_m_k: alpha_m){ - alpha += alpha_m_k.second; - } - - return alpha; - } - -private: - std::map optimize_mu(const inverted_index& idx, float eps=1e-6, int max_iter=10000) { - // parse idx and extract what we need - auto docs_ids = 
idx.docs(); - auto terms_ids = idx.terms(); - - // calculate ref_size - count_d ref_size = 0; - for (auto& id : docs_ids) - ref_size += idx.doc_size(id); - - // calculate C_.(n) and C_k(n) - std::map docs_counts; - std::map> terms_docs_counts; - - long doc_size, doc_term_freq; - for (auto d_id: docs_ids){ - doc_size = idx.doc_size(d_id); - - //// increase number of docs with the given size (C_.(n)) - docs_counts[doc_size] += 1; - - for (auto t_id: terms_ids){ - doc_term_freq = idx.term_freq(t_id, d_id); - - //// increase number of docs with the given count of word t_id (C_k(n)) - terms_docs_counts[t_id][doc_term_freq] += 1; - } - } - - // fill start vector alpha_m - std::map alpha_m; - - for (auto t_id: terms_ids){ - alpha_m[t_id] = idx.total_num_occurences(t_id) * default_mu; - alpha_m[t_id] /= (double)ref_size; - } - - // create docs_data - docs_data dd{idx, docs_ids, terms_ids, ref_size, docs_counts, terms_docs_counts, alpha_m}; - - // call optimizer - return optimize_mu(dd, eps, max_iter); - } - - virtual std::map optimize_mu(docs_data& dd, float eps, int max_iter) = 0; -}; - -class dirichlet_digamma_rec: public dirichlet_prior_opt{ -public: - const static util::string_view id; - - /** - * @param mu - */ - dirichlet_digamma_rec(float mu = default_mu); - - /** - * Loads a dirichlet_prior ranker from a stream. 
- * @param in The stream to read from - */ - dirichlet_digamma_rec(std::istream& in); - - void save(std::ostream& out) const override; -private: - std::map optimize_mu(docs_data& dd, float eps, int max_iter) override { - bool all_optimized = false; - int iter_num = 0; - double D, S; - double n_max = dd.docs_counts.rbegin()->first; - - // start values for alpha and alpha_m - double alpha = default_mu, alpha_mk_new; - std::map alpha_m = dd.alpha_m; - - while (!all_optimized && iter_num < max_iter){ - D = 0.0; - S = 0.0; - all_optimized = true; - - alpha = get_alpha(alpha_m); - - count_d c_d; - for (count_d n = 1; n <= n_max; n++){ - c_d = dd.docs_counts[n]; - - D += 1.0/(n - 1 + alpha); - S += c_d * D; - } - - term_id k; - std::map c_k; - double S_k; - for (auto kv: dd.terms_docs_counts){ - k = kv.first; - c_k = kv.second; - - D = 0.0; - S_k = 0.0; - - count_d c_k_n, n_k_max = c_k.rbegin()->first; - for (count_d n = 1; n <= n_k_max; n++){ - c_k_n = c_k[n]; - - D += 1.0/(n - 1 + alpha_m[k]); - S_k += c_k_n * D; - } - - alpha_mk_new = alpha_m[k] * S_k / S; - - if (std::abs(alpha_mk_new - alpha_m[k]) > eps){ - all_optimized = false; - } - - alpha_m[k] = alpha_mk_new; - } - - iter_num++; - } - - mu_ = get_alpha(alpha_m); - - return alpha_m; - } - -}; - -class dirichlet_log_approx: public dirichlet_prior_opt{ -public: - const static util::string_view id; - - /** - * @param mu - */ - dirichlet_log_approx(float mu = default_mu); - - /** - * Loads a dirichlet_prior ranker from a stream. 
- * @param in The stream to read from - */ - dirichlet_log_approx(std::istream& in); - - void save(std::ostream& out) const override; -private: - std::map optimize_mu(docs_data& dd, float eps, int max_iter) override { - bool all_optimized = false; - int iter_num = 0; - double S, S_k; - double n_max = dd.docs_counts.rbegin()->first; - - // start values for alpha and alpha_m - double alpha = default_mu, alpha_mk_new; - std::map alpha_m = dd.alpha_m; - - while (!all_optimized && iter_num < max_iter){ - S = 0.0; - all_optimized = true; - - alpha = get_alpha(alpha_m); - - count_d c_d; - // TODO: skip the zero docs counts - for (count_d n = 1; n <= n_max; n++){ - c_d = dd.docs_counts[n]; - - if (c_d != 0){ - S += c_d * (1.0/alpha + log(n + alpha - 0.5) - log(alpha + 0.5)); - } - } - - term_id k; - std::map c_k; - for (auto kv: dd.terms_docs_counts){ - k = kv.first; - c_k = kv.second; - - S_k = 0.0; - - count_d c_k_n, n_k_max = c_k.rbegin()->first; - // TODO: skip the zero docs counts - for (count_d n = 1; n <= n_k_max; n++){ - c_k_n = c_k[n]; - - if (c_k_n != 0){ - S_k += c_k_n * (1.0/alpha_m[k] + log(n + alpha_m[k] - 0.5) - log(alpha_m[k] + 0.5)); - } - } - - alpha_mk_new = alpha_m[k] * S_k / S; - - if (std::abs(alpha_mk_new - alpha_m[k]) > eps){ - all_optimized = false; - } - - alpha_m[k] = alpha_mk_new; - } - - iter_num++; - } - - mu_ = get_alpha(alpha_m); - - return alpha_m; - } -}; - -class dirichlet_mackay_peto: public dirichlet_prior_opt{ -public: - const static util::string_view id; - - /** - * @param mu - */ - dirichlet_mackay_peto(float mu = default_mu); - - /** - * Loads a dirichlet_prior ranker from a stream. 
- * @param in The stream to read from - */ - dirichlet_mackay_peto(std::istream& in); - - void save(std::ostream& out) const override; -private: - std::map optimize_mu(docs_data& dd, float eps, int max_iter) override { - eps = eps; - max_iter = max_iter; - eps = dd.ref_size; - mu_ = 0; - std::map alpha_m; - - return alpha_m; - } -}; - /** * Specialization of the factory method used to create dirichlet_prior * rankers. @@ -395,26 +84,6 @@ class dirichlet_mackay_peto: public dirichlet_prior_opt{ template <> std::unique_ptr make_ranker(const cpptoml::table&); -/** - * Specialization of the factory method used to create dirichlet_digamma_rec - * rankers. - */ -template <> -std::unique_ptr make_ranker(const cpptoml::table&); - -/** - * Specialization of the factory method used to create dirichlet_log_approx - * rankers. - */ -template <> -std::unique_ptr make_ranker(const cpptoml::table&); - -/** - * Specialization of the factory method used to create dirichlet_mackay_peto - * rankers. - */ -template <> -std::unique_ptr make_ranker(const cpptoml::table&); } } #endif diff --git a/src/index/ranker/CMakeLists.txt b/src/index/ranker/CMakeLists.txt index e386d54b6..f0e699418 100644 --- a/src/index/ranker/CMakeLists.txt +++ b/src/index/ranker/CMakeLists.txt @@ -4,6 +4,7 @@ add_subdirectory(test_opt) add_library(meta-ranker absolute_discount.cpp dirichlet_prior.cpp + dirichlet_prior_opt.cpp jelinek_mercer.cpp lm_ranker.cpp okapi_bm25.cpp diff --git a/src/index/ranker/dirichlet_prior.cpp b/src/index/ranker/dirichlet_prior.cpp index 9cf0fed10..3a43297d6 100644 --- a/src/index/ranker/dirichlet_prior.cpp +++ b/src/index/ranker/dirichlet_prior.cpp @@ -56,94 +56,5 @@ std::unique_ptr return make_unique(mu); } -const util::string_view dirichlet_digamma_rec::id = "dirichlet-digamma-rec"; -template <> -std::unique_ptr - make_ranker(const cpptoml::table& config) -{ - auto mu = config.get_as("mu").value_or(dirichlet_digamma_rec::default_mu); - if (mu < 0) - throw 
ranker_exception{"dirichlet-digamma-rec mu must be >= 0"}; - return make_unique(mu); -} - -dirichlet_digamma_rec::dirichlet_digamma_rec(float mu) : dirichlet_prior_opt(mu) -{ - // nothing -} - -dirichlet_digamma_rec::dirichlet_digamma_rec(std::istream& in) - : dirichlet_prior_opt(in) -{ - // nothing -} - -void dirichlet_digamma_rec::save(std::ostream& out) const -{ - io::packed::write(out, id); - - io::packed::write(out, mu_); -} - -const util::string_view dirichlet_log_approx::id = "dirichlet-log-approx"; -template <> -std::unique_ptr - make_ranker(const cpptoml::table& config) -{ - auto mu = config.get_as("mu").value_or(dirichlet_log_approx::default_mu); - if (mu < 0) - throw ranker_exception{"dirichlet-log-approx mu must be >= 0"}; - return make_unique(mu); -} - - -dirichlet_log_approx::dirichlet_log_approx(float mu) : dirichlet_prior_opt(mu) -{ - // nothing -} - -dirichlet_log_approx::dirichlet_log_approx(std::istream& in) - : dirichlet_prior_opt(in) -{ - // nothing -} - -void dirichlet_log_approx::save(std::ostream& out) const -{ - io::packed::write(out, id); - - io::packed::write(out, mu_); -} - -const util::string_view dirichlet_mackay_peto::id = "dirichlet-mackay-peto"; -template <> -std::unique_ptr - make_ranker(const cpptoml::table& config) -{ - auto mu = config.get_as("mu").value_or(dirichlet_mackay_peto::default_mu); - if (mu < 0) - throw ranker_exception{"dirichlet-mackay-peto mu must be >= 0"}; - return make_unique(mu); -} - - -dirichlet_mackay_peto::dirichlet_mackay_peto(float mu) : dirichlet_prior_opt(mu) -{ - // nothing -} - -dirichlet_mackay_peto::dirichlet_mackay_peto(std::istream& in) - : dirichlet_prior_opt(in) -{ - // nothing -} - -void dirichlet_mackay_peto::save(std::ostream& out) const -{ - io::packed::write(out, id); - - io::packed::write(out, mu_); -} - } } From c8ddfbfdd9eb620e35b2e21d665289b33d19b828 Mon Sep 17 00:00:00 2001 From: Alex2304 Date: Fri, 1 Dec 2017 02:24:45 +0300 Subject: [PATCH 27/30] [opt] + dirichlet_prior_opt --- 
.../meta/index/ranker/dirichlet_prior_opt.h | 241 +++++++++++++++++ src/index/ranker/dirichlet_prior_opt.cpp | 242 ++++++++++++++++++ 2 files changed, 483 insertions(+) create mode 100644 include/meta/index/ranker/dirichlet_prior_opt.h create mode 100644 src/index/ranker/dirichlet_prior_opt.cpp diff --git a/include/meta/index/ranker/dirichlet_prior_opt.h b/include/meta/index/ranker/dirichlet_prior_opt.h new file mode 100644 index 000000000..face18029 --- /dev/null +++ b/include/meta/index/ranker/dirichlet_prior_opt.h @@ -0,0 +1,241 @@ +/** + * @file dirichlet_prior_opt.h + * @author Aleksey Marashov, Kolomiets Maxim + * + * All files in META are released under the MIT license. For more details, + * consult the file LICENSE in the root of the project. + */ + +#ifndef META_DIRICHLET_PRIOR_OPT_H_ +#define META_DIRICHLET_PRIOR_OPT_H_ + +#include "meta/index/ranker/dirichlet_prior.h" + +#include + +namespace meta +{ +namespace index +{ + +// # TODO: choose template type instead of long +typedef long count_d; + +struct docs_data +{ + // general info + + const inverted_index& idx; + /// ids of all documents + std::vector doc_ids; + /// ids of all terms + std::vector term_ids; + /// total size of documents + count_d ref_size; + /// C_.(n) + std::map docs_counts; + /// C_k(n) + std::map> terms_docs_counts; + /// vector alpha_m + std::map alpha_m; + + /** + * Constructor to initialize most elements. 
+ * @param p_idx The index that is being used + * @param p_doc_ids ids of all docs + * @param p_term_ids ids of all terms + */ + docs_data(const inverted_index& p_idx, std::vector p_doc_ids, std::vector p_term_ids, count_d p_ref_size, + std::map p_docs_counts, std::map> p_terms_docs_counts, + std::map p_alpha_m) + : idx(p_idx), // gcc no non-const ref init from brace init list + doc_ids{p_doc_ids}, + term_ids{p_term_ids}, + ref_size{p_ref_size}, + docs_counts{p_docs_counts}, + terms_docs_counts{p_terms_docs_counts}, + alpha_m{p_alpha_m} + { + /* nothing */ + } +}; + + +/** + * Implements Bayesian smoothing with a Dirichlet prior. + * + * Required config parameters: + * ~~~toml + * [ranker] + * method = "dirichlet-prior" + * ~~~ + * + * Optional config parameters: + * ~~~toml + * mu = 2000.0 + * ~~~ + */ +class dirichlet_prior_opt : public dirichlet_prior{ +public: + dirichlet_prior_opt(float mu) : dirichlet_prior(mu) { } + + dirichlet_prior_opt(std::istream& in) : dirichlet_prior(in) { } + + template + std::vector score(inverted_index& idx, ForwardIterator begin, + ForwardIterator end, + uint64_t num_results = 10) + { + this->optimize_mu(idx); + + return ranker::score(idx, begin, end, num_results); + } + + std::map get_optimized_mu(const inverted_index& idx, float eps, int max_iter) { + return optimize_mu(idx, eps, max_iter); + } + +protected: + inline double get_alpha(std::map alpha_m){ + double alpha = 0; + + for (auto alpha_m_k: alpha_m){ + alpha += alpha_m_k.second; + } + + return alpha; + } + +private: + std::map optimize_mu(const inverted_index& idx, float eps=1e-6, int max_iter=10000) { + // parse idx and extract what we need + + auto docs_ids = idx.docs(); + auto terms_ids = idx.terms(); + + // calculate ref_size + count_d ref_size = 0; + for (auto& id : docs_ids) + ref_size += idx.doc_size(id); + + // calculate C_.(n) and C_k(n) + std::map docs_counts; + std::map> terms_docs_counts; + + long doc_size, doc_term_freq; + for (auto d_id: docs_ids){ + doc_size 
= idx.doc_size(d_id); + + //// increase number of docs with the given size (C_.(n)) + docs_counts[doc_size] += 1; + + for (auto t_id: terms_ids){ + doc_term_freq = idx.term_freq(t_id, d_id); + + //// increase number of docs with the given count of word t_id (C_k(n)) + terms_docs_counts[t_id][doc_term_freq] += 1; + } + } + + // fill start vector alpha_m + std::map alpha_m; + + for (auto t_id: terms_ids){ + alpha_m[t_id] = idx.total_num_occurences(t_id) * default_mu; + alpha_m[t_id] /= (double)ref_size; + } + + // create docs_data + docs_data dd{idx, docs_ids, terms_ids, ref_size, docs_counts, terms_docs_counts, alpha_m}; + + // call optimizer + return optimize_mu(dd, eps, max_iter); + } + + virtual std::map optimize_mu(docs_data& dd, float eps, int max_iter) = 0; +}; + +class dirichlet_digamma_rec: public dirichlet_prior_opt{ +public: + const static util::string_view id; + + /** + * @param mu + */ + dirichlet_digamma_rec(float mu = default_mu); + + /** + * Loads a dirichlet_prior ranker from a stream. + * @param in The stream to read from + */ + dirichlet_digamma_rec(std::istream& in); + + void save(std::ostream& out) const override; +private: + std::map optimize_mu(docs_data& dd, float eps, int max_iter) override; + +}; + +class dirichlet_log_approx: public dirichlet_prior_opt{ +public: + const static util::string_view id; + + /** + * @param mu + */ + dirichlet_log_approx(float mu = default_mu); + + /** + * Loads a dirichlet_prior ranker from a stream. + * @param in The stream to read from + */ + dirichlet_log_approx(std::istream& in); + + void save(std::ostream& out) const override; +private: + std::map optimize_mu(docs_data& dd, float eps, int max_iter) override; +}; + +class dirichlet_mackay_peto: public dirichlet_prior_opt{ +public: + const static util::string_view id; + + /** + * @param mu + */ + dirichlet_mackay_peto(float mu = default_mu); + + /** + * Loads a dirichlet_prior ranker from a stream. 
+ * @param in The stream to read from + */ + dirichlet_mackay_peto(std::istream& in); + + void save(std::ostream& out) const override; +private: + std::map optimize_mu(docs_data& dd, float eps, int max_iter) override; +}; + +/** + * Specialization of the factory method used to create dirichlet_digamma_rec + * rankers. + */ +template <> +std::unique_ptr make_ranker(const cpptoml::table&); + +/** + * Specialization of the factory method used to create dirichlet_log_approx + * rankers. + */ +template <> +std::unique_ptr make_ranker(const cpptoml::table&); + +/** + * Specialization of the factory method used to create dirichlet_mackay_peto + * rankers. + */ +template <> +std::unique_ptr make_ranker(const cpptoml::table&); +} +} +#endif diff --git a/src/index/ranker/dirichlet_prior_opt.cpp b/src/index/ranker/dirichlet_prior_opt.cpp new file mode 100644 index 000000000..2a2df1ce5 --- /dev/null +++ b/src/index/ranker/dirichlet_prior_opt.cpp @@ -0,0 +1,242 @@ +/** + * @file dirichlet_prior_opt.cpp + * @author Aleksey Marashov, Kolomiets Maksim + */ + +#include "cpptoml.h" +#include "meta/index/ranker/dirichlet_prior_opt.h" +#include "meta/index/score_data.h" + +namespace meta +{ +namespace index +{ + +// makers + +const util::string_view dirichlet_digamma_rec::id = "dirichlet-digamma-rec"; +template <> +std::unique_ptr + make_ranker(const cpptoml::table& config) +{ + auto mu = config.get_as("mu").value_or(dirichlet_digamma_rec::default_mu); + if (mu < 0) + throw ranker_exception{"dirichlet-digamma-rec mu must be >= 0"}; + return make_unique(mu); +} + +const util::string_view dirichlet_log_approx::id = "dirichlet-log-approx"; +template <> +std::unique_ptr + make_ranker(const cpptoml::table& config) +{ + auto mu = config.get_as("mu").value_or(dirichlet_log_approx::default_mu); + if (mu < 0) + throw ranker_exception{"dirichlet-log-approx mu must be >= 0"}; + return make_unique(mu); +} + +const util::string_view dirichlet_mackay_peto::id = "dirichlet-mackay-peto"; +template <> 
+std::unique_ptr + make_ranker(const cpptoml::table& config) +{ + auto mu = config.get_as("mu").value_or(dirichlet_mackay_peto::default_mu); + if (mu < 0) + throw ranker_exception{"dirichlet-mackay-peto mu must be >= 0"}; + return make_unique(mu); +} + +// constructors + +dirichlet_digamma_rec::dirichlet_digamma_rec(float mu) : dirichlet_prior_opt(mu) +{ + // nothing +} + +dirichlet_digamma_rec::dirichlet_digamma_rec(std::istream& in) + : dirichlet_prior_opt(in) +{ + // nothing +} + +void dirichlet_digamma_rec::save(std::ostream& out) const +{ + io::packed::write(out, id); + + io::packed::write(out, mu_); +} + + +dirichlet_log_approx::dirichlet_log_approx(float mu) : dirichlet_prior_opt(mu) +{ + // nothing +} + +dirichlet_log_approx::dirichlet_log_approx(std::istream& in) + : dirichlet_prior_opt(in) +{ + // nothing +} + +void dirichlet_log_approx::save(std::ostream& out) const +{ + io::packed::write(out, id); + + io::packed::write(out, mu_); +} + + +dirichlet_mackay_peto::dirichlet_mackay_peto(float mu) : dirichlet_prior_opt(mu) +{ + // nothing +} + +dirichlet_mackay_peto::dirichlet_mackay_peto(std::istream& in) + : dirichlet_prior_opt(in) +{ + // nothing +} + +void dirichlet_mackay_peto::save(std::ostream& out) const +{ + io::packed::write(out, id); + + io::packed::write(out, mu_); +} + +// optimization methods + +std::map dirichlet_digamma_rec::optimize_mu(docs_data& dd, float eps, int max_iter) { + bool all_optimized = false; + int iter_num = 0; + double D, S; + double n_max = dd.docs_counts.rbegin()->first; + + // start values for alpha and alpha_m + double alpha = default_mu, alpha_mk_new; + std::map alpha_m = dd.alpha_m; + + while (!all_optimized && iter_num < max_iter){ + D = 0.0; + S = 0.0; + all_optimized = true; + + alpha = get_alpha(alpha_m); + + count_d c_d; + for (count_d n = 1; n <= n_max; n++){ + c_d = dd.docs_counts[n]; + + D += 1.0/(n - 1 + alpha); + S += c_d * D; + } + + term_id k; + std::map c_k; + double S_k; + for (auto kv: 
dd.terms_docs_counts){ + k = kv.first; + c_k = kv.second; + + D = 0.0; + S_k = 0.0; + + count_d c_k_n, n_k_max = c_k.rbegin()->first; + for (count_d n = 1; n <= n_k_max; n++){ + c_k_n = c_k[n]; + + D += 1.0/(n - 1 + alpha_m[k]); + S_k += c_k_n * D; + } + + alpha_mk_new = alpha_m[k] * S_k / S; + + if (std::abs(alpha_mk_new - alpha_m[k]) > eps){ + all_optimized = false; + } + + alpha_m[k] = alpha_mk_new; + } + + iter_num++; + } + + mu_ = get_alpha(alpha_m); + + return alpha_m; +} + +std::map dirichlet_log_approx::optimize_mu(docs_data& dd, float eps, int max_iter) { + bool all_optimized = false; + int iter_num = 0; + double S, S_k; + double n_max = dd.docs_counts.rbegin()->first; + + // start values for alpha and alpha_m + double alpha = default_mu, alpha_mk_new; + std::map alpha_m = dd.alpha_m; + + while (!all_optimized && iter_num < max_iter){ + S = 0.0; + all_optimized = true; + + alpha = get_alpha(alpha_m); + + count_d c_d; + // TODO: skip the zero docs counts + for (count_d n = 1; n <= n_max; n++){ + c_d = dd.docs_counts[n]; + + if (c_d != 0){ + S += c_d * (1.0/alpha + log(n + alpha - 0.5) - log(alpha + 0.5)); + } + } + + term_id k; + std::map c_k; + for (auto kv: dd.terms_docs_counts){ + k = kv.first; + c_k = kv.second; + + S_k = 0.0; + + count_d c_k_n, n_k_max = c_k.rbegin()->first; + // TODO: skip the zero docs counts + for (count_d n = 1; n <= n_k_max; n++){ + c_k_n = c_k[n]; + + if (c_k_n != 0){ + S_k += c_k_n * (1.0/alpha_m[k] + log(n + alpha_m[k] - 0.5) - log(alpha_m[k] + 0.5)); + } + } + + alpha_mk_new = alpha_m[k] * S_k / S; + + if (std::abs(alpha_mk_new - alpha_m[k]) > eps){ + all_optimized = false; + } + + alpha_m[k] = alpha_mk_new; + } + + iter_num++; + } + + mu_ = get_alpha(alpha_m); + + return alpha_m; +} + +std::map dirichlet_mackay_peto::optimize_mu(docs_data& dd, float eps, int max_iter) { + eps = eps; + max_iter = max_iter; + eps = dd.ref_size; + mu_ = 0; + std::map alpha_m; + + return alpha_m; +} + +} +} From 
f7b634a1f1bde37ff4f3043efd2ae373ea335d77 Mon Sep 17 00:00:00 2001 From: Alex2304 Date: Fri, 1 Dec 2017 03:17:25 +0300 Subject: [PATCH 28/30] [opt] + MacKay and Peto method --- src/index/ranker/dirichlet_prior_opt.cpp | 59 ++++++++++++++++++++++-- src/index/ranker/test_opt/test.cpp | 14 +++--- 2 files changed, 62 insertions(+), 11 deletions(-) diff --git a/src/index/ranker/dirichlet_prior_opt.cpp b/src/index/ranker/dirichlet_prior_opt.cpp index 2a2df1ce5..4405719c4 100644 --- a/src/index/ranker/dirichlet_prior_opt.cpp +++ b/src/index/ranker/dirichlet_prior_opt.cpp @@ -229,11 +229,60 @@ std::map dirichlet_log_approx::optimize_mu(docs_data& dd, float } std::map dirichlet_mackay_peto::optimize_mu(docs_data& dd, float eps, int max_iter) { - eps = eps; - max_iter = max_iter; - eps = dd.ref_size; - mu_ = 0; - std::map alpha_m; + bool all_optimized = false; + int iter_num = 0; + + // start values for alpha and alpha_m + double alpha = default_mu, alpha_mk_new; + std::map alpha_m = dd.alpha_m; + + while (!all_optimized && iter_num < max_iter){ + all_optimized = true; + + alpha = get_alpha(alpha_m); + + // compute K(alpha) + double K_alpha = 0; + for (auto d_id: dd.doc_ids){ + double n_d = dd.idx.doc_size(d_id); + K_alpha += log((n_d + alpha) / alpha) + 0.5 * n_d / (alpha * (n_d + alpha)); + } + + term_id k; + std::map c_k; + for (auto kv: dd.terms_docs_counts){ + k = kv.first; + c_k = kv.second; + + count_d n_k_max = c_k.rbegin()->first; + + // compute V_k + count_d V_k = dd.idx.doc_freq(k); + + // compute H_k and G_k + double H_k = 0, G_k = 0; + count_d N_f = 0; + for (count_d f = n_k_max; f >= 2; f--){ + N_f += c_k[f]; + + G_k += (double) N_f / (f - 1.0); + H_k += (double) N_f / pow(f - 1.0, 2); + } + + // recompute alpha_mk + alpha_mk_new = 2 * V_k / (K_alpha - G_k + sqrt(pow(K_alpha - G_k, 2) + 4 * H_k * V_k)); + + if (std::abs(alpha_mk_new - alpha_m[k]) > eps){ + all_optimized = false; + } + + alpha_m[k] = alpha_mk_new; + } + + iter_num++; + } + + mu_ = 
get_alpha(alpha_m); return alpha_m; } diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp index 2c27f2427..a12771f53 100644 --- a/src/index/ranker/test_opt/test.cpp +++ b/src/index/ranker/test_opt/test.cpp @@ -48,6 +48,7 @@ int main(int argc, char* argv[]) index::dirichlet_digamma_rec ranker1; index::dirichlet_log_approx ranker2; + index::dirichlet_mackay_peto ranker3; auto time1 = common::time([&]() { @@ -66,12 +67,13 @@ int main(int argc, char* argv[]) display_result(alpha, alpha_m, time2.count() / 1.0); -// time = common::time([&]() -// { -// // Create and make score of optimizer -// index::mackay_peto ranker; -// std::cout << ranker.get_optimized_mu(*idx) << std::endl; -// }); + auto time3 = common::time([&]() + { + alpha_m = ranker3.get_optimized_mu(*idx, eps, iters); + alpha = ranker3.parameter(); + }); + + display_result(alpha, alpha_m, time3.count() / 1.0); return 0; } From d4b0a8d9f50dcd6dc1b09b96a2867838345e2660 Mon Sep 17 00:00:00 2001 From: Alex2304 Date: Sun, 3 Dec 2017 14:22:33 +0300 Subject: [PATCH 29/30] [opt] + comments and docs --- .../meta/index/ranker/dirichlet_prior_opt.h | 84 +++++++++++++++---- 1 file changed, 66 insertions(+), 18 deletions(-) diff --git a/include/meta/index/ranker/dirichlet_prior_opt.h b/include/meta/index/ranker/dirichlet_prior_opt.h index face18029..351b01b2a 100644 --- a/include/meta/index/ranker/dirichlet_prior_opt.h +++ b/include/meta/index/ranker/dirichlet_prior_opt.h @@ -18,19 +18,17 @@ namespace meta namespace index { -// # TODO: choose template type instead of long typedef long count_d; struct docs_data { - // general info - + /// inverted index const inverted_index& idx; - /// ids of all documents + /// ids of all documents in the index std::vector doc_ids; - /// ids of all terms + /// ids of all terms in the index std::vector term_ids; - /// total size of documents + /// total size of all documents count_d ref_size; /// C_.(n) std::map docs_counts; @@ -62,18 +60,10 @@ struct 
docs_data /** - * Implements Bayesian smoothing with a Dirichlet prior. + * Abstract class for Dirichlet prior smoothing with optimized constant mu. + * Constant mu is optimized at the stage of scoring documents using information about those documents. * - * Required config parameters: - * ~~~toml - * [ranker] - * method = "dirichlet-prior" - * ~~~ - * - * Optional config parameters: - * ~~~toml - * mu = 2000.0 - * ~~~ + * Virtual method optimize_mu(docs_data& dd, float eps, int max_iter) must be overridden in derived classes. */ class dirichlet_prior_opt : public dirichlet_prior{ public: @@ -86,6 +76,7 @@ class dirichlet_prior_opt : public dirichlet_prior{ ForwardIterator end, uint64_t num_results = 10) { + // optimize mu before scoring this->optimize_mu(idx); return ranker::score(idx, begin, end, num_results); @@ -107,13 +98,24 @@ class dirichlet_prior_opt : public dirichlet_prior{ } private: + /** + * Extracts information necessary to find optimal mu and wraps it into docs_data. + * Then, calls class-specific implementation of optimize_mu function. + * Found optimal value of mu is written to the member of the class. + * + * @param idx inverted index + * @param eps convergence precision + * @param max_iter maximal number of iterations (upper bound) + * + * @return optimal value [alpha * m_i] for each term + */ std::map optimize_mu(const inverted_index& idx, float eps=1e-6, int max_iter=10000) { // parse idx and extract what we need auto docs_ids = idx.docs(); auto terms_ids = idx.terms(); - // calculate ref_size + // calculate total size of all documents count_d ref_size = 0; for (auto& id : docs_ids) ref_size += idx.doc_size(id); @@ -152,9 +154,31 @@ class dirichlet_prior_opt : public dirichlet_prior{ return optimize_mu(dd, eps, max_iter); } + /** + * Finds optimal mu using information from given docs_data structure. + * Writes optimal mu to the corresponding field of the class.
+ * + * @param dd document statistics extracted from the inverted index + * @param eps convergence precision + * @param max_iter maximal number of iterations (upper bound) + * + * @return optimal value [alpha * m_i] for each term + */ virtual std::map optimize_mu(docs_data& dd, float eps, int max_iter) = 0; }; +/** + * Implements Dirichlet Prior smoothing with optimized constant mu. + * + * Optimization method is Fixed-Point Iteration with digamma recurrence relation + * described at: https://people.cs.umass.edu/~wallach/theses/wallach_phd_thesis.pdf, pp. 27-28. + * + * Required config parameters: + * ~~~toml + * [ranker] + * method = "dirichlet-digamma-rec" + * ~~~ + */ class dirichlet_digamma_rec: public dirichlet_prior_opt{ public: const static util::string_view id; @@ -176,6 +200,18 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{ }; +/** + * Implements Dirichlet Prior smoothing with optimized constant mu. + * + * Optimization method is Fixed-Point Iteration with digamma differences log approximation + * described at: https://people.cs.umass.edu/~wallach/theses/wallach_phd_thesis.pdf, pp. 28-29. + * + * Required config parameters: + * ~~~toml + * [ranker] + * method = "dirichlet-log-approx" + * ~~~ + */ class dirichlet_log_approx: public dirichlet_prior_opt{ public: const static util::string_view id; @@ -196,6 +232,18 @@ class dirichlet_log_approx: public dirichlet_prior_opt{ std::map optimize_mu(docs_data& dd, float eps, int max_iter) override; }; +/** + * Implements Dirichlet Prior smoothing with optimized constant mu. + * + * Optimization method is MacKay and Peto's Fixed-Point Iteration with efficient computation of N_fk + * described at: https://people.cs.umass.edu/~wallach/theses/wallach_phd_thesis.pdf, p. 30.
+ * + * Required config parameters: + * ~~~toml + * [ranker] + * method = "dirichlet-mackay-peto" + * ~~~ + */ class dirichlet_mackay_peto: public dirichlet_prior_opt{ public: const static util::string_view id; From 001fac6cecbe2b73b5db5888f4056460423f1ac1 Mon Sep 17 00:00:00 2001 From: Alex2304 Date: Mon, 4 Dec 2017 12:51:49 +0300 Subject: [PATCH 30/30] [opt] - test files --- include/meta/stats/statistics.h | 102 ----------------------- src/index/ranker/CMakeLists.txt | 4 +- src/index/ranker/dirichlet_prior.cpp | 1 - src/index/ranker/test_opt/CMakeLists.txt | 10 --- src/index/ranker/test_opt/test.cpp | 79 ------------------ 5 files changed, 1 insertion(+), 195 deletions(-) delete mode 100644 src/index/ranker/test_opt/CMakeLists.txt delete mode 100644 src/index/ranker/test_opt/test.cpp diff --git a/include/meta/stats/statistics.h b/include/meta/stats/statistics.h index 168af1245..ac1fb054e 100644 --- a/include/meta/stats/statistics.h +++ b/include/meta/stats/statistics.h @@ -18,108 +18,6 @@ namespace meta { namespace stats { - -#ifndef M_PIl -/** The constant Pi in high precision */ -#define M_PIl 3.1415926535897932384626433832795029L -#endif -#ifndef M_GAMMAl -/** Euler's constant in high precision */ -#define M_GAMMAl 0.5772156649015328606065120900824024L -#endif -#ifndef M_LN2l -/** the natural logarithm of 2 in high precision */ -#define M_LN2l 0.6931471805599453094172321214581766L -#endif - -/** The digamma function in long double precision. -* @param x the real value of the argument -* @return the value of the digamma (psi) function at that point -* @author Richard J. 
Mathar -* @since 2005-11-24 -*/ -long double digamma(long double x) -{ - /* force into the interval 1..3 */ - if( x < 0.0L ) - return digamma(1.0L-x)+M_PIl/tanl(M_PIl*(1.0L-x)) ; /* reflection formula */ - else if( x < 1.0L ) - return digamma(1.0L+x)-1.0L/x ; - else if ( x == 1.0L) - return -M_GAMMAl ; - else if ( x == 2.0L) - return 1.0L-M_GAMMAl ; - else if ( x == 3.0L) - return 1.5L-M_GAMMAl ; - else if ( x > 3.0L) - /* duplication formula */ - return 0.5L*(digamma(x/2.0L)+digamma((x+1.0L)/2.0L))+M_LN2l ; - else - { - /* Just for your information, the following lines contain - * the Maple source code to re-generate the table that is - * eventually becoming the Kncoe[] array below - * interface(prettyprint=0) : - * Digits := 63 : - * r := 0 : - * - * for l from 1 to 60 do - * d := binomial(-1/2,l) : - * r := r+d*(-1)^l*(Zeta(2*l+1) -1) ; - * evalf(r) ; - * print(%,evalf(1+Psi(1)-r)) ; - *o d : - * - * for N from 1 to 28 do - * r := 0 : - * n := N-1 : - * - * for l from iquo(n+3,2) to 70 do - * d := 0 : - * for s from 0 to n+1 do - * d := d+(-1)^s*binomial(n+1,s)*binomial((s-1)/2,l) : - * od : - * if 2*l-n > 1 then - * r := r+d*(-1)^l*(Zeta(2*l-n) -1) : - * fi : - * od : - * print(evalf((-1)^n*2*r)) ; - *od : - *quit : - */ - static long double Kncoe[] = { .30459198558715155634315638246624251L, - .72037977439182833573548891941219706L, -.12454959243861367729528855995001087L, - .27769457331927827002810119567456810e-1L, -.67762371439822456447373550186163070e-2L, - .17238755142247705209823876688592170e-2L, -.44817699064252933515310345718960928e-3L, - .11793660000155572716272710617753373e-3L, -.31253894280980134452125172274246963e-4L, - .83173997012173283398932708991137488e-5L, -.22191427643780045431149221890172210e-5L, - .59302266729329346291029599913617915e-6L, -.15863051191470655433559920279603632e-6L, - .42459203983193603241777510648681429e-7L, -.11369129616951114238848106591780146e-7L, - .304502217295931698401459168423403510e-8L, 
-.81568455080753152802915013641723686e-9L, - .21852324749975455125936715817306383e-9L, -.58546491441689515680751900276454407e-10L, - .15686348450871204869813586459513648e-10L, -.42029496273143231373796179302482033e-11L, - .11261435719264907097227520956710754e-11L, -.30174353636860279765375177200637590e-12L, - .80850955256389526647406571868193768e-13L, -.21663779809421233144009565199997351e-13L, - .58047634271339391495076374966835526e-14L, -.15553767189204733561108869588173845e-14L, - .41676108598040807753707828039353330e-15L, -.11167065064221317094734023242188463e-15L } ; - - register long double Tn_1 = 1.0L ; /* T_{n-1}(x), started at n=1 */ - register long double Tn = x-2.0L ; /* T_{n}(x) , started at n=1 */ - register long double resul = Kncoe[0] + Kncoe[1]*Tn ; - - x -= 2.0L ; - - for(int n = 2 ; n < sizeof(Kncoe)/sizeof(long double) ;n++) - { - const long double Tn1 = 2.0L * x * Tn - Tn_1 ; /* Chebyshev recursion, Eq. 22.7.4 Abramowitz-Stegun */ - resul += Kncoe[n]*Tn1 ; - Tn_1 = Tn ; - Tn = Tn1 ; - } - return resul ; - } -} - /** * Computation for \f$E_d[f(x)]\f$ where \f$d\f$ is specified by the * `dist` parameter and \f$f(x)\f$ is the `fun` parameter. 
diff --git a/src/index/ranker/CMakeLists.txt b/src/index/ranker/CMakeLists.txt index f0e699418..84d22701d 100644 --- a/src/index/ranker/CMakeLists.txt +++ b/src/index/ranker/CMakeLists.txt @@ -1,10 +1,8 @@ project(meta-ranker) -add_subdirectory(test_opt) - add_library(meta-ranker absolute_discount.cpp dirichlet_prior.cpp - dirichlet_prior_opt.cpp + dirichlet_prior_opt.cpp jelinek_mercer.cpp lm_ranker.cpp okapi_bm25.cpp diff --git a/src/index/ranker/dirichlet_prior.cpp b/src/index/ranker/dirichlet_prior.cpp index 3a43297d6..07536afbe 100644 --- a/src/index/ranker/dirichlet_prior.cpp +++ b/src/index/ranker/dirichlet_prior.cpp @@ -55,6 +55,5 @@ std::unique_ptr throw ranker_exception{"dirichlet-prior mu must be >= 0"}; return make_unique(mu); } - } } diff --git a/src/index/ranker/test_opt/CMakeLists.txt b/src/index/ranker/test_opt/CMakeLists.txt deleted file mode 100644 index 1e8cdc0ee..000000000 --- a/src/index/ranker/test_opt/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -project(meta-dirichlet-test) - -include_directories(../../../../include) - -add_executable(test_opt test.cpp) - -target_link_libraries(test_opt meta-ranker - meta-sequence-analyzers - meta-parser-analyzers) - diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp deleted file mode 100644 index a12771f53..000000000 --- a/src/index/ranker/test_opt/test.cpp +++ /dev/null @@ -1,79 +0,0 @@ -#include "meta/corpus/document.h" -#include "meta/index/ranker/all.h" -#include "meta/index/forward_index.h" - -#include - -#include "meta/index/inverted_index.h" -#include "meta/logging/logger.h" -#include "meta/parser/analyzers/tree_analyzer.h" -#include "meta/sequence/analyzers/ngram_pos_analyzer.h" -#include "meta/util/time.h" - -using namespace meta; - - -void display_result(float alpha, std::map alpha_m, float time){ - for (auto kv: alpha_m){ - std::cout << kv.second << " "; - } - std::cout << std::endl << alpha << std::endl << time << std::endl; -} - -int main(int argc, char* 
argv[]) -{ - if (argc != 2) - { - std::cerr << "Usage:\t" << argv[0] << " configFile" << std::endl; - return 1; - } - - // Turn on logging to std::cerr. - logging::set_cerr_logging(); - - // Register additional analyzers - parser::register_analyzers(); - sequence::register_analyzers(); - - // Creates an inverted index with no cache. We don't need a cache here - // since we're never searching the index, only building it. - auto config = cpptoml::parse_file(argv[1]); - auto idx = index::make_index(*config); - - double eps = 1e-6; - int iters = 10000; - - float alpha; - std::map alpha_m; - - index::dirichlet_digamma_rec ranker1; - index::dirichlet_log_approx ranker2; - index::dirichlet_mackay_peto ranker3; - - auto time1 = common::time([&]() - { - alpha_m = ranker1.get_optimized_mu(*idx, eps, iters); - alpha = ranker1.parameter(); - }); - - display_result(alpha, alpha_m, time1.count() / 1.0); - - auto time2 = common::time([&]() - { - alpha_m = ranker2.get_optimized_mu(*idx, eps, iters); - alpha = ranker2.parameter(); - }); - - display_result(alpha, alpha_m, time2.count() / 1.0); - - - auto time3 = common::time([&]() - { - alpha_m = ranker3.get_optimized_mu(*idx, eps, iters); - alpha = ranker3.parameter(); - }); - - display_result(alpha, alpha_m, time3.count() / 1.0); - - return 0; -}