From 1aa70324bc96494801533d7ee880424d4169ea10 Mon Sep 17 00:00:00 2001
From: Aleksey <alex2304el@gmail.com>
Date: Sat, 18 Nov 2017 17:52:22 +0300
Subject: [PATCH 01/30] Add optimization.h and update .gitignore

---
 .gitignore                        |  2 ++
 include/meta/stats/optimization.h | 25 +++++++++++++++++++++++++
 2 files changed, 27 insertions(+)
 create mode 100644 include/meta/stats/optimization.h

diff --git a/.gitignore b/.gitignore
index 9167fd029..73e3a6ff6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,5 @@ data/cranfield
 biicode.conf
 bii/
 bin/
+*.pro
+*.pro.user
\ No newline at end of file
diff --git a/include/meta/stats/optimization.h b/include/meta/stats/optimization.h
new file mode 100644
index 000000000..c623439b6
--- /dev/null
+++ b/include/meta/stats/optimization.h
@@ -0,0 +1,25 @@
+#ifndef OPTIMIZATION_H
+#define OPTIMIZATION_H
+
+#include "meta/embeddings/word_embeddings.h"
+
+using namespace meta::embeddings;
+
+namespace meta
+{
+namespace stats
+{
+namespace opt
+{
+
+// first method
+double minka_fpi(word_embeddings model){
+    for (auto term_it: model.vocab()){
+
+    }
+}
+
+}
+}
+}
+#endif // OPTIMIZATION_H

From 5e3e23c4623805352a357e4caca36b8e64cbe971 Mon Sep 17 00:00:00 2001
From: Aleksey <alex2304el@gmail.com>
Date: Sun, 19 Nov 2017 13:02:09 +0300
Subject: [PATCH 02/30] [opt] dirichlet_optimizer class, digamma function

---
 include/meta/stats/optimization.h |  57 +++++++++++++++--
 include/meta/stats/statistics.h   | 101 ++++++++++++++++++++++++++++++
 2 files changed, 152 insertions(+), 6 deletions(-)

diff --git a/include/meta/stats/optimization.h b/include/meta/stats/optimization.h
index c623439b6..5fc13748b 100644
--- a/include/meta/stats/optimization.h
+++ b/include/meta/stats/optimization.h
@@ -2,8 +2,11 @@
 #define OPTIMIZATION_H
 
 #include "meta/embeddings/word_embeddings.h"
+#include "meta/stats/statistics.h"
 
+using namespace meta::stats;
 using namespace meta::embeddings;
+using namespace meta::util;
 
 namespace meta
 {
@@ -12,14 +15,56 @@ namespace stats
 namespace opt
 {
 
-// first method
-double minka_fpi(word_embeddings model){
-    for (auto term_it: model.vocab()){
+aligned_vector<std::string> get_docs_voc(aligned_vector<word_embeddings> docs_models){
+    aligned_vector<std::string> docs_voc();
 
+    for (auto m_iter: docs_models){
+        // todo
     }
-}
 
+    return docs_voc;
 }
-}
-}
+
+class dirichlet_optimizer{
+public:
+    dirichlet_optimizer(aligned_vector<word_embeddings> docs_models, int alpha=1){
+        this->docs_models_ = docs_models;
+        this->default_alpha_ = alpha;
+
+        this->docs_voc_ = get_docs_voc(docs_models);
+    }
+
+    double minka_fpi(){
+        double alpha = default_alpha_;
+
+        double nom, denom,
+                alpha_m,
+                alpha_dig, alpha_m_dig,
+                all_words_count, k_words_count;
+
+        // digamma(x)
+        for (std::string word: voc){
+            // todo
+        }
+    }
+
+    double minka_newton(){
+        // todo
+    }
+
+    double minka_lou(){
+        // todo
+    }
+
+private:
+    double minka_fpi_iters();
+    double minka_newton_iters();
+    double minka_lou_iters();
+
+    aligned_vector<word_embeddings> docs_models_;
+    aligned_vector<std::string> docs_voc_;
+
+    double default_alpha_;
+};
+
 #endif // OPTIMIZATION_H
diff --git a/include/meta/stats/statistics.h b/include/meta/stats/statistics.h
index d6823fed1..168af1245 100644
--- a/include/meta/stats/statistics.h
+++ b/include/meta/stats/statistics.h
@@ -19,6 +19,107 @@ namespace meta
 namespace stats
 {
 
+#ifndef M_PIl
+/** The constant Pi in high precision */
+#define M_PIl 3.1415926535897932384626433832795029L
+#endif
+#ifndef M_GAMMAl
+/** Euler's constant in high precision */
+#define M_GAMMAl 0.5772156649015328606065120900824024L
+#endif
+#ifndef M_LN2l
+/** the natural logarithm of 2 in high precision */
+#define M_LN2l 0.6931471805599453094172321214581766L
+#endif
+
+/** The digamma function in long double precision.
+* @param x the real value of the argument
+* @return the value of the digamma (psi) function at that point
+* @author Richard J. Mathar
+* @since 2005-11-24
+*/
+long double digamma(long double x)
+{
+    /* force into the interval 1..3 */
+    if( x < 0.0L )
+        return digamma(1.0L-x)+M_PIl/tanl(M_PIl*(1.0L-x)) ;	/* reflection formula */
+    else if( x < 1.0L )
+        return digamma(1.0L+x)-1.0L/x ;
+    else if ( x == 1.0L)
+        return -M_GAMMAl ;
+    else if ( x == 2.0L)
+        return 1.0L-M_GAMMAl ;
+    else if ( x == 3.0L)
+        return 1.5L-M_GAMMAl ;
+    else if ( x > 3.0L)
+        /* duplication formula */
+        return 0.5L*(digamma(x/2.0L)+digamma((x+1.0L)/2.0L))+M_LN2l ;
+    else
+    {
+        /* Just for your information, the following lines contain
+        * the Maple source code to re-generate the table that is
+        * eventually becoming the Kncoe[] array below
+        * interface(prettyprint=0) :
+        * Digits := 63 :
+        * r := 0 :
+        *
+        * for l from 1 to 60 do
+        * 	d := binomial(-1/2,l) :
+        * 	r := r+d*(-1)^l*(Zeta(2*l+1) -1) ;
+        * 	evalf(r) ;
+        * 	print(%,evalf(1+Psi(1)-r)) ;
+        *o d :
+        *
+        * for N from 1 to 28 do
+        * 	r := 0 :
+        * 	n := N-1 :
+        *
+        *	for l from iquo(n+3,2) to 70 do
+        *		d := 0 :
+        *		for s from 0 to n+1 do
+        *		 d := d+(-1)^s*binomial(n+1,s)*binomial((s-1)/2,l) :
+        *		od :
+        *		if 2*l-n > 1 then
+        *		r := r+d*(-1)^l*(Zeta(2*l-n) -1) :
+        *		fi :
+        *	od :
+        *	print(evalf((-1)^n*2*r)) ;
+        *od :
+        *quit :
+        */
+        static long double Kncoe[] = { .30459198558715155634315638246624251L,
+        .72037977439182833573548891941219706L, -.12454959243861367729528855995001087L,
+        .27769457331927827002810119567456810e-1L, -.67762371439822456447373550186163070e-2L,
+        .17238755142247705209823876688592170e-2L, -.44817699064252933515310345718960928e-3L,
+        .11793660000155572716272710617753373e-3L, -.31253894280980134452125172274246963e-4L,
+        .83173997012173283398932708991137488e-5L, -.22191427643780045431149221890172210e-5L,
+        .59302266729329346291029599913617915e-6L, -.15863051191470655433559920279603632e-6L,
+        .42459203983193603241777510648681429e-7L, -.11369129616951114238848106591780146e-7L,
+        .304502217295931698401459168423403510e-8L, -.81568455080753152802915013641723686e-9L,
+        .21852324749975455125936715817306383e-9L, -.58546491441689515680751900276454407e-10L,
+        .15686348450871204869813586459513648e-10L, -.42029496273143231373796179302482033e-11L,
+        .11261435719264907097227520956710754e-11L, -.30174353636860279765375177200637590e-12L,
+        .80850955256389526647406571868193768e-13L, -.21663779809421233144009565199997351e-13L,
+        .58047634271339391495076374966835526e-14L, -.15553767189204733561108869588173845e-14L,
+        .41676108598040807753707828039353330e-15L, -.11167065064221317094734023242188463e-15L } ;
+
+        register long double Tn_1 = 1.0L ;	/* T_{n-1}(x), started at n=1 */
+        register long double Tn = x-2.0L ;	/* T_{n}(x) , started at n=1 */
+        register long double resul = Kncoe[0] + Kncoe[1]*Tn ;
+
+        x -= 2.0L ;
+
+        for(int n = 2 ; n < sizeof(Kncoe)/sizeof(long double) ;n++)
+        {
+            const long double Tn1 = 2.0L * x * Tn - Tn_1 ;	/* Chebyshev recursion, Eq. 22.7.4 Abramowitz-Stegun */
+            resul += Kncoe[n]*Tn1 ;
+            Tn_1 = Tn ;
+            Tn = Tn1 ;
+        }
+        return resul ;
+    }
+}
+
 /**
  * Computation for \f$E_d[f(x)]\f$ where \f$d\f$ is specified by the
  * `dist` parameter and \f$f(x)\f$ is the `fun` parameter.

From b8dbc7d8ebd66c399587ac7bc6abf4c3f57d610c Mon Sep 17 00:00:00 2001
From: Aleksey <alex2304el@gmail.com>
Date: Sun, 19 Nov 2017 23:16:17 +0300
Subject: [PATCH 03/30] [opt] minka_fpi method draft

---
 include/meta/stats/optimization.h | 108 ++++++++++++++++++++++++------
 1 file changed, 89 insertions(+), 19 deletions(-)

diff --git a/include/meta/stats/optimization.h b/include/meta/stats/optimization.h
index 5fc13748b..cf85b8609 100644
--- a/include/meta/stats/optimization.h
+++ b/include/meta/stats/optimization.h
@@ -3,10 +3,14 @@
 
 #include "meta/embeddings/word_embeddings.h"
 #include "meta/stats/statistics.h"
+#include "meta/analyzers/featurizer.h"
+
+#include <cmath>
 
 using namespace meta::stats;
 using namespace meta::embeddings;
 using namespace meta::util;
+using namespace meta::analyzers;
 
 namespace meta
 {
@@ -15,37 +19,85 @@ namespace stats
 namespace opt
 {
 
-aligned_vector<std::string> get_docs_voc(aligned_vector<word_embeddings> docs_models){
-    aligned_vector<std::string> docs_voc();
+aligned_vector<long> get_docs_sizes(aligned_vector<feature_map<long>> docs_models){
+    aligned_vector<long> docs_sizes;
 
-    for (auto m_iter: docs_models){
-        // todo
+    long doc_size;
+    for (int i = 0; i < docs_models.size(); i++){
+        doc_size = 0;
+
+        for (auto word: docs_models[i]){
+            doc_size += docs_models[i][word];
+        }
+
+        docs_sizes.push_back(doc_size);
     }
 
-    return docs_voc;
+    return docs_sizes;
+}
+
+feature_map<long> get_ref_voc(aligned_vector<feature_map<long>> docs_models){
+    feature_map<long> ref_voc;
+    featurizer<long> f(ref_voc);
+
+    for (feature_map<long> doc_model: docs_models){
+        for (auto word: doc_model){
+            f(word, doc_model[word]);
+        }
+    }
+
+    return ref_voc;
 }
 
 class dirichlet_optimizer{
 public:
-    dirichlet_optimizer(aligned_vector<word_embeddings> docs_models, int alpha=1){
+    dirichlet_optimizer(aligned_vector<feature_map<long>> docs_models, int alpha=1){
         this->docs_models_ = docs_models;
+        this->docs_sizes_ = get_docs_sizes(docs_models);
+
         this->default_alpha_ = alpha;
 
-        this->docs_voc_ = get_docs_voc(docs_models);
+        this->ref_voc_ = get_docs_voc(docs_models);
     }
 
-    double minka_fpi(){
-        double alpha = default_alpha_;
+    typedef std::map<std::string, double> text_vector;
+
+    text_vector minka_fpi(double eps=1e-6, int max_iters=100){
+        std::map<std::string, double> alpha_m;
+
+        // create initial alpa_m vector
+        for (auto word: ref_voc_){
+            alpha_m[word] = default_alpha_ * ref_voc_[word];
+        }
+
+        // stoping criteria for the whole vector alpha_m
+        int vector_iteration = 0;
+        double l_dist = std::numeric_limits::infinity();
+        bool all_optimal = true;
+
+        while (vector_iteration <= max_iters && !all_optimal){
+            all_optimal = true;
+            std::string word_k;
+            double alpha_m_k, alpha_k, alpha_m_k_new;
+
+            for (auto alpha_m_iter: alpha_m){
+                word_k = alpha_m_iter.first;
+                alpha_m_k = alpha_m_iter.second;
+
+                alpha_k = alpha_m_k / ref_voc_[word_k];
 
-        double nom, denom,
-                alpha_m,
-                alpha_dig, alpha_m_dig,
-                all_words_count, k_words_count;
+                // make a step and find new alpha_m_k
+                alpha_m_k_new = minka_fpi_step(word_k, alpha_k, alpha_m_k);
 
-        // digamma(x)
-        for (std::string word: voc){
-            // todo
+                if (!is_optimal(alpha_m_k, alpha_m_k_new)){
+                    all_optimal = false;
+
+                    alpha_m[word_k] = alpha_m_k_new;
+                }
+            }
         }
+
+        return alpha_m;
     }
 
     double minka_newton(){
@@ -57,12 +109,30 @@ class dirichlet_optimizer{
     }
 
 private:
-    double minka_fpi_iters();
+    double minka_fpi_step(std::string word_k, double alpha_k, double alpha_m_k){
+        double nom = 0, denom = 0;
+
+        double alpha_m_k_dig = digamma(alpha_m_k),
+                alpha_k_dig = digamma(alpha_k);
+
+        long all_words_count, k_words_count;
+
+        for (int d = 0; d < docs_models_.size(); d++){
+            nom += digamma(docs_models_[d][word_k] + alpha_m_k) - alpha_m_k_dig;
+
+            denom += digamma(docs_sizes_[d] + alpha_k) - alpha_k_dig;
+        }
+
+        return alpha_m_k * nom / denom;
+    }
+
     double minka_newton_iters();
     double minka_lou_iters();
 
-    aligned_vector<word_embeddings> docs_models_;
-    aligned_vector<std::string> docs_voc_;
+    aligned_vector<feature_map<long>> docs_models_;
+    aligned_vector<long> docs_sizes_;
+
+    feature_map<long> ref_voc_;
 
     double default_alpha_;
 };

From 3dc03a8caf772a925b8843c7c90c2512317e1306 Mon Sep 17 00:00:00 2001
From: Aleksey <alex2304el@gmail.com>
Date: Mon, 20 Nov 2017 17:51:05 +0300
Subject: [PATCH 04/30] [opt] optimization.h errors fixed, test without MeTa

---
 include/meta/stats/opt_test.cpp   |  90 +++++++++++++
 include/meta/stats/optimization.h | 212 ++++++++++++++++++++++++++----
 2 files changed, 274 insertions(+), 28 deletions(-)
 create mode 100644 include/meta/stats/opt_test.cpp

diff --git a/include/meta/stats/opt_test.cpp b/include/meta/stats/opt_test.cpp
new file mode 100644
index 000000000..314873bb4
--- /dev/null
+++ b/include/meta/stats/opt_test.cpp
@@ -0,0 +1,90 @@
+#include <iostream>
+#include <sstream>
+
+#include "optimization.h"
+
+#ifndef TEST_OPT
+#define TEST_OPT 1
+#endif // TEST_OPT
+
+using namespace meta::stats::opt;
+
+#ifdef TEST_OPT
+int main(){
+    feature_map<celoe> dm1, dm2, dm3;
+
+    dm1["1"] = 8;
+    dm1["2"] = 3;
+
+    dm2["2"] = 4;
+    dm2["3"] = 3;
+
+    dm3["3"] = 4;
+    dm3["4"] = 6;
+
+    std::vector<feature_map<celoe>> dms;
+    dms.push_back(dm1);
+    dms.push_back(dm2);
+    dms.push_back(dm3);
+
+    dirichlet_optimizer optimizer(dms);
+
+    auto optimized = optimizer.minka_fpi();
+
+    for (auto iter: optimized){
+        std::cout << iter.first << " " << iter.second << std::endl;
+    }
+}
+#else
+#include "meta/stats/optimization.h"
+#include "meta/analyzers/ngram/ngram_word_analyzer.h"
+#include "meta/corpus/document.h"
+#include "meta/analyzers/token_stream.h"
+#include "meta/analyzers/tokenizers/character_tokenizer.h"
+
+#include "../tests/create_config.h"
+#include "meta/meta.h"
+
+#include "../src/analyzers/analyzer.cpp"
+
+using namespace meta::stats::opt;
+using namespace meta::analyzers;
+using namespace meta::corpus;
+using namespace meta::analyzers::tokenizers;
+using namespace meta::tests;
+
+std::unique_ptr<token_stream> make_filter() {
+    auto line_cfg = create_config("line");
+    return default_filter_chain(*line_cfg);
+}
+
+
+int main(){
+    document doc1(meta::doc_id{47}), doc2(meta::doc_id{48}), doc3(meta::doc_id{49});
+    doc1.content("Quaia Quaia Coronoid");
+    doc2.content("Dj extra Quaia Quaia");
+    doc3.content("Coronoid Coronoid Diagram Dj");
+
+    character_tokenizer tokenizer;
+
+    tokenizer.set_content("Quaia Quaia Coronoid");
+
+    std::vector<feature_map<celoe>> docs_models;
+
+    ngram_word_analyzer anal(1, make_filter());
+
+    docs_models.push_back(anal.analyze<celoe>(doc1));
+    docs_models.push_back(anal.analyze<celoe>(doc2));
+    docs_models.push_back(anal.analyze<celoe>(doc3));
+
+    dirichlet_optimizer optimizer(docs_models);
+
+    auto res_map = optimizer.minka_fpi();
+
+    for (auto iter: res_map){
+        std::cout << iter.first << " " << iter.second << std::endl;
+    }
+
+    return 0;
+}
+#endif // TEST_OPT
diff --git a/include/meta/stats/optimization.h b/include/meta/stats/optimization.h
index cf85b8609..4ee923e46 100644
--- a/include/meta/stats/optimization.h
+++ b/include/meta/stats/optimization.h
@@ -1,17 +1,129 @@
 #ifndef OPTIMIZATION_H
 #define OPTIMIZATION_H
 
-#include "meta/embeddings/word_embeddings.h"
-#include "meta/stats/statistics.h"
-#include "meta/analyzers/featurizer.h"
+#ifndef TEST_OPT
+#define TEST_OPT 1
+#endif // TEST_OPT
 
 #include <cmath>
+#include <map>
+#include <vector>
+
+#ifndef TEST_OPT
+#include "meta/stats/statistics.h"
+#include "meta/analyzers/featurizer.h"
 
 using namespace meta::stats;
-using namespace meta::embeddings;
 using namespace meta::util;
 using namespace meta::analyzers;
 
+#else
+
+#ifndef M_PIl
+/** The constant Pi in high precision */
+#define M_PIl 3.1415926535897932384626433832795029L
+#endif
+#ifndef M_GAMMAl
+/** Euler's constant in high precision */
+#define M_GAMMAl 0.5772156649015328606065120900824024L
+#endif
+#ifndef M_LN2l
+/** the natural logarithm of 2 in high precision */
+#define M_LN2l 0.6931471805599453094172321214581766L
+#endif
+
+/** The digamma function in long double precision.
+* @param x the real value of the argument
+* @return the value of the digamma (psi) function at that point
+* @author Richard J. Mathar
+* @since 2005-11-24
+*/
+long double digamma(long double x)
+{
+    /* force into the interval 1..3 */
+    if( x < 0.0L )
+        return digamma(1.0L-x)+M_PIl/tanl(M_PIl*(1.0L-x)) ;	/* reflection formula */
+    else if( x < 1.0L )
+        return digamma(1.0L+x)-1.0L/x ;
+    else if ( x == 1.0L)
+        return -M_GAMMAl ;
+    else if ( x == 2.0L)
+        return 1.0L-M_GAMMAl ;
+    else if ( x == 3.0L)
+        return 1.5L-M_GAMMAl ;
+    else if ( x > 3.0L)
+        /* duplication formula */
+        return 0.5L*(digamma(x/2.0L)+digamma((x+1.0L)/2.0L))+M_LN2l ;
+    else
+    {
+        /* Just for your information, the following lines contain
+        * the Maple source code to re-generate the table that is
+        * eventually becoming the Kncoe[] array below
+        * interface(prettyprint=0) :
+        * Digits := 63 :
+        * r := 0 :
+        *
+        * for l from 1 to 60 do
+        * 	d := binomial(-1/2,l) :
+        * 	r := r+d*(-1)^l*(Zeta(2*l+1) -1) ;
+        * 	evalf(r) ;
+        * 	print(%,evalf(1+Psi(1)-r)) ;
+        *o d :
+        *
+        * for N from 1 to 28 do
+        * 	r := 0 :
+        * 	n := N-1 :
+        *
+        *	for l from iquo(n+3,2) to 70 do
+        *		d := 0 :
+        *		for s from 0 to n+1 do
+        *		 d := d+(-1)^s*binomial(n+1,s)*binomial((s-1)/2,l) :
+        *		od :
+        *		if 2*l-n > 1 then
+        *		r := r+d*(-1)^l*(Zeta(2*l-n) -1) :
+        *		fi :
+        *	od :
+        *	print(evalf((-1)^n*2*r)) ;
+        *od :
+        *quit :
+        */
+        static long double Kncoe[] = { .30459198558715155634315638246624251L,
+        .72037977439182833573548891941219706L, -.12454959243861367729528855995001087L,
+        .27769457331927827002810119567456810e-1L, -.67762371439822456447373550186163070e-2L,
+        .17238755142247705209823876688592170e-2L, -.44817699064252933515310345718960928e-3L,
+        .11793660000155572716272710617753373e-3L, -.31253894280980134452125172274246963e-4L,
+        .83173997012173283398932708991137488e-5L, -.22191427643780045431149221890172210e-5L,
+        .59302266729329346291029599913617915e-6L, -.15863051191470655433559920279603632e-6L,
+        .42459203983193603241777510648681429e-7L, -.11369129616951114238848106591780146e-7L,
+        .304502217295931698401459168423403510e-8L, -.81568455080753152802915013641723686e-9L,
+        .21852324749975455125936715817306383e-9L, -.58546491441689515680751900276454407e-10L,
+        .15686348450871204869813586459513648e-10L, -.42029496273143231373796179302482033e-11L,
+        .11261435719264907097227520956710754e-11L, -.30174353636860279765375177200637590e-12L,
+        .80850955256389526647406571868193768e-13L, -.21663779809421233144009565199997351e-13L,
+        .58047634271339391495076374966835526e-14L, -.15553767189204733561108869588173845e-14L,
+        .41676108598040807753707828039353330e-15L, -.11167065064221317094734023242188463e-15L } ;
+
+        register long double Tn_1 = 1.0L ;	/* T_{n-1}(x), started at n=1 */
+        register long double Tn = x-2.0L ;	/* T_{n}(x) , started at n=1 */
+        register long double resul = Kncoe[0] + Kncoe[1]*Tn ;
+
+        x -= 2.0L ;
+
+        for(int n = 2 ; n < sizeof(Kncoe)/sizeof(long double) ;n++)
+        {
+            const long double Tn1 = 2.0L * x * Tn - Tn_1 ;	/* Chebyshev recursion, Eq. 22.7.4 Abramowitz-Stegun */
+            resul += Kncoe[n]*Tn1 ;
+            Tn_1 = Tn ;
+            Tn = Tn1 ;
+        }
+        return resul ;
+    }
+}
+
+template <class T> using feature_map = std::map<std::string, T>;
+
+#endif // TEST_OPT
+
 namespace meta
 {
 namespace stats
@@ -19,15 +131,17 @@ namespace stats
 namespace opt
 {
 
-aligned_vector<long> get_docs_sizes(aligned_vector<feature_map<long>> docs_models){
-    aligned_vector<long> docs_sizes;
+typedef uint64_t celoe;
+
+std::vector<long> get_docs_sizes(std::vector<feature_map<celoe>> docs_models){
+    std::vector<long> docs_sizes;
 
     long doc_size;
     for (int i = 0; i < docs_models.size(); i++){
         doc_size = 0;
 
         for (auto word: docs_models[i]){
-            doc_size += docs_models[i][word];
+            doc_size += docs_models[i][word.first];
         }
 
         docs_sizes.push_back(doc_size);
@@ -36,60 +150,93 @@ aligned_vector<long> get_docs_sizes(aligned_vector<feature_map<long>> docs_model
     return docs_sizes;
 }
 
-feature_map<long> get_ref_voc(aligned_vector<feature_map<long>> docs_models){
-    feature_map<long> ref_voc;
-    featurizer<long> f(ref_voc);
+#ifndef TEST_OPT
+feature_map<celoe> get_ref_voc(std::vector<feature_map<celoe>> docs_models){
+    feature_map<celoe> ref_voc;
+    featurizer f(ref_voc);
+
+    for (auto doc_model: docs_models){
+        for (auto word: doc_model){
+            f(word.key(), word.value());
+        }
+    }
+
+    return ref_voc;
+}
+
+#else
+
+feature_map<celoe> get_ref_voc(std::vector<feature_map<celoe>> docs_models){
+    feature_map<celoe> ref_voc;
 
-    for (feature_map<long> doc_model: docs_models){
+    for (auto doc_model: docs_models){
         for (auto word: doc_model){
-            f(word, doc_model[word]);
+            ref_voc[word.first] += word.second;
         }
     }
 
     return ref_voc;
 }
 
+celoe get_ref_voc_size(feature_map<celoe> ref_voc){
+    celoe ref_voc_size = 0;
+
+    for (auto word: ref_voc){
+        ref_voc_size += word.second;
+    }
+
+    return ref_voc_size;
+}
+
+#endif // TEST_OPT
+
+#include <iostream>
+using namespace std;
+
 class dirichlet_optimizer{
 public:
-    dirichlet_optimizer(aligned_vector<feature_map<long>> docs_models, int alpha=1){
-        this->docs_models_ = docs_models;
+    dirichlet_optimizer(std::vector<feature_map<celoe>> docs_models, int alpha=1)
+    {
+        this->docs_models_.assign(docs_models.begin(), docs_models.end());
         this->docs_sizes_ = get_docs_sizes(docs_models);
 
         this->default_alpha_ = alpha;
 
-        this->ref_voc_ = get_docs_voc(docs_models);
-    }
+        this->ref_voc_ = get_ref_voc(docs_models);
+        this->ref_voc_size_ = get_ref_voc_size(this->ref_voc_);
 
-    typedef std::map<std::string, double> text_vector;
+        cout << this->ref_voc_size_ << endl;
+    }
 
-    text_vector minka_fpi(double eps=1e-6, int max_iters=100){
+    std::map<std::string, double> minka_fpi(double eps=1e-6, int max_iters=100){
         std::map<std::string, double> alpha_m;
 
         // create initial alpa_m vector
         for (auto word: ref_voc_){
-            alpha_m[word] = default_alpha_ * ref_voc_[word];
+            alpha_m[word.first] = default_alpha_ * word.second / ref_voc_size_;
         }
 
         // stoping criteria for the whole vector alpha_m
         int vector_iteration = 0;
-        double l_dist = std::numeric_limits::infinity();
-        bool all_optimal = true;
+        bool all_optimal;
 
         while (vector_iteration <= max_iters && !all_optimal){
             all_optimal = true;
             std::string word_k;
             double alpha_m_k, alpha_k, alpha_m_k_new;
 
+            cout << endl;
+
             for (auto alpha_m_iter: alpha_m){
                 word_k = alpha_m_iter.first;
                 alpha_m_k = alpha_m_iter.second;
 
-                alpha_k = alpha_m_k / ref_voc_[word_k];
+                alpha_k = alpha_m_k / ((double)ref_voc_[word_k] / ref_voc_size_);
 
                 // make a step and find new alpha_m_k
                 alpha_m_k_new = minka_fpi_step(word_k, alpha_k, alpha_m_k);
 
-                if (!is_optimal(alpha_m_k, alpha_m_k_new)){
+                if (std::abs(alpha_m_k - alpha_m_k_new) > eps){
                     all_optimal = false;
 
                     alpha_m[word_k] = alpha_m_k_new;
@@ -111,30 +258,39 @@ class dirichlet_optimizer{
 private:
     double minka_fpi_step(std::string word_k, double alpha_k, double alpha_m_k){
         double nom = 0, denom = 0;
-
         double alpha_m_k_dig = digamma(alpha_m_k),
                 alpha_k_dig = digamma(alpha_k);
 
         long all_words_count, k_words_count;
 
         for (int d = 0; d < docs_models_.size(); d++){
+
             nom += digamma(docs_models_[d][word_k] + alpha_m_k) - alpha_m_k_dig;
 
             denom += digamma(docs_sizes_[d] + alpha_k) - alpha_k_dig;
+
         }
 
-        return alpha_m_k * nom / denom;
+        double alpha_m_k_new = alpha_m_k * nom / denom;;
+
+        cout << word_k << " " << alpha_k << " " << alpha_m_k << " " << alpha_m_k / ((double)ref_voc_[word_k] / ref_voc_size_) << " " << alpha_m_k_new << " " << nom << " " << denom << endl;
+
+        return alpha_m_k_new;
     }
 
     double minka_newton_iters();
     double minka_lou_iters();
 
-    aligned_vector<feature_map<long>> docs_models_;
-    aligned_vector<long> docs_sizes_;
+    std::vector<feature_map<celoe>> docs_models_;
+    std::vector<long> docs_sizes_;
 
-    feature_map<long> ref_voc_;
+    feature_map<celoe> ref_voc_;
+    celoe ref_voc_size_;
 
     double default_alpha_;
 };
+}
+}
+}
 
 #endif // OPTIMIZATION_H

From eeb91688b8ee66d630255c42004dd35307f0fd71 Mon Sep 17 00:00:00 2001
From: Aleksey <alex2304el@gmail.com>
Date: Mon, 20 Nov 2017 19:43:16 +0300
Subject: [PATCH 05/30] [opt] debug output

---
 include/meta/stats/opt_test.cpp   |  6 ++----
 include/meta/stats/optimization.h | 16 +++++++++++++---
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/include/meta/stats/opt_test.cpp b/include/meta/stats/opt_test.cpp
index 314873bb4..6bb857362 100644
--- a/include/meta/stats/opt_test.cpp
+++ b/include/meta/stats/opt_test.cpp
@@ -29,11 +29,9 @@ int main(){
 
     dirichlet_optimizer optimizer(dms);
 
-    auto optimized = optimizer.minka_fpi();
+    auto optimal_alpha = optimizer.minka_fpi();
 
-    for (auto iter: optimized){
-        std::cout << iter.first << " " << iter.second << std::endl;
-    }
+    cout << endl << "optimal alpha: " << optimal_alpha;
 }
 #else
 #include "meta/stats/optimization.h"
diff --git a/include/meta/stats/optimization.h b/include/meta/stats/optimization.h
index 4ee923e46..06ab27f14 100644
--- a/include/meta/stats/optimization.h
+++ b/include/meta/stats/optimization.h
@@ -208,7 +208,7 @@ class dirichlet_optimizer{
         cout << this->ref_voc_size_ << endl;
     }
 
-    std::map<std::string, double> minka_fpi(double eps=1e-6, int max_iters=100){
+    double minka_fpi(double eps=1e-3, int max_iters=100){
         std::map<std::string, double> alpha_m;
 
         // create initial alpa_m vector
@@ -225,7 +225,7 @@ class dirichlet_optimizer{
             std::string word_k;
             double alpha_m_k, alpha_k, alpha_m_k_new;
 
-            cout << endl;
+            cout << vector_iteration << endl;
 
             for (auto alpha_m_iter: alpha_m){
                 word_k = alpha_m_iter.first;
@@ -242,9 +242,19 @@ class dirichlet_optimizer{
                     alpha_m[word_k] = alpha_m_k_new;
                 }
             }
+
+            vector_iteration++;
+        }
+
+        cout << endl << "Alpha_m for each word:" << endl;
+
+        double optimal_alpha = 0;
+        for (auto alpha_m_iter: alpha_m){
+            cout << alpha_m_iter.first << " " << alpha_m_iter.second << std::endl;
+            optimal_alpha += alpha_m_iter.second;
         }
 
-        return alpha_m;
+        return optimal_alpha;
     }
 
     double minka_newton(){

From 766754f9726dd16c7709a7c8716fc1643d59dd74 Mon Sep 17 00:00:00 2001
From: MakKolts <makkolts@gmail.com>
Date: Mon, 20 Nov 2017 23:32:35 +0300
Subject: [PATCH 06/30] Adding optimization.cpp

---
 src/stats/CMakeLists.txt   | 2 +-
 src/stats/optimization.cpp | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)
 create mode 100644 src/stats/optimization.cpp

diff --git a/src/stats/CMakeLists.txt b/src/stats/CMakeLists.txt
index c9872741c..3e09824f2 100644
--- a/src/stats/CMakeLists.txt
+++ b/src/stats/CMakeLists.txt
@@ -1,6 +1,6 @@
 project(meta-stats)
 
-add_library(meta-stats running_stats.cpp)
+add_library(meta-stats running_stats.cpp optimization.cpp)
 target_link_libraries(meta-stats meta-definitions)
 
 install(TARGETS meta-stats
diff --git a/src/stats/optimization.cpp b/src/stats/optimization.cpp
new file mode 100644
index 000000000..87a7f3af4
--- /dev/null
+++ b/src/stats/optimization.cpp
@@ -0,0 +1 @@
+#include "meta/stats/optimization.h"

From e9c99df1d85a12baee46b311ee92e518db1377c4 Mon Sep 17 00:00:00 2001
From: Aleksey <alex2304el@gmail.com>
Date: Wed, 29 Nov 2017 17:14:58 +0300
Subject: [PATCH 07/30] [opt] classes for methods in dirichlet_prior

---
 include/meta/index/ranker/dirichlet_prior.h |  23 ++
 include/meta/stats/opt_test.cpp             |  88 ------
 include/meta/stats/optimization.h           | 306 --------------------
 src/index/ranker/dirichlet_prior.cpp        |   2 +-
 4 files changed, 24 insertions(+), 395 deletions(-)
 delete mode 100644 include/meta/stats/opt_test.cpp
 delete mode 100644 include/meta/stats/optimization.h

diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index dfb5aef42..37ff18504 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -71,6 +71,29 @@ class dirichlet_prior : public language_model_ranker
     const float mu_;
 };
 
+class dirichlet_prior_opt : public dirichlet_prior{
+    void rank(ranker_context &ctx, uint64_t num_results, const filter_function_type &filter) const override{
+        // optimize mu according to ranker_context before ranking
+        this->optimize_mu(ctx);
+
+        ranking_function::rank(ctx, num_results, filter);
+    }
+
+    virtual void optimize_mu(const ranker_context &ctx) = 0;
+};
+
+class digamma_rec: public dirichlet_prior_opt{
+    void optimize_mu(const ranker_context &ctx) override;
+};
+
+class log_approx: public dirichlet_prior_opt{
+    void optimize_mu(const ranker_context &ctx) override;
+};
+
+class mackay_peto: public dirichlet_prior_opt{
+    void optimize_mu(const ranker_context &ctx) override;
+};
+
 /**
  * Specialization of the factory method used to create dirichlet_prior
  * rankers.
diff --git a/include/meta/stats/opt_test.cpp b/include/meta/stats/opt_test.cpp
deleted file mode 100644
index 6bb857362..000000000
--- a/include/meta/stats/opt_test.cpp
+++ /dev/null
@@ -1,88 +0,0 @@
-#include <iostream>
-#include <sstream>
-
-#include "optimization.h"
-
-#ifndef TEST_OPT
-#define TEST_OPT 1
-#endif // TEST_OPT
-
-using namespace meta::stats::opt;
-
-#ifdef TEST_OPT
-int main(){
-    feature_map<celoe> dm1, dm2, dm3;
-
-    dm1["1"] = 8;
-    dm1["2"] = 3;
-
-    dm2["2"] = 4;
-    dm2["3"] = 3;
-
-    dm3["3"] = 4;
-    dm3["4"] = 6;
-
-    std::vector<feature_map<celoe>> dms;
-    dms.push_back(dm1);
-    dms.push_back(dm2);
-    dms.push_back(dm3);
-
-    dirichlet_optimizer optimizer(dms);
-
-    auto optimal_alpha = optimizer.minka_fpi();
-
-    cout << endl << "optimal alpha: " << optimal_alpha;
-}
-#else
-#include "meta/stats/optimization.h"
-#include "meta/analyzers/ngram/ngram_word_analyzer.h"
-#include "meta/corpus/document.h"
-#include "meta/analyzers/token_stream.h"
-#include "meta/analyzers/tokenizers/character_tokenizer.h"
-
-#include "../tests/create_config.h"
-#include "meta/meta.h"
-
-#include "../src/analyzers/analyzer.cpp"
-
-using namespace meta::stats::opt;
-using namespace meta::analyzers;
-using namespace meta::corpus;
-using namespace meta::analyzers::tokenizers;
-using namespace meta::tests;
-
-std::unique_ptr<token_stream> make_filter() {
-    auto line_cfg = create_config("line");
-    return default_filter_chain(*line_cfg);
-}
-
-
-int main(){
-    document doc1(meta::doc_id{47}), doc2(meta::doc_id{48}), doc3(meta::doc_id{49});
-    doc1.content("Quaia Quaia Coronoid");
-    doc2.content("Dj extra Quaia Quaia");
-    doc3.content("Coronoid Coronoid Diagram Dj");
-
-    character_tokenizer tokenizer;
-
-    tokenizer.set_content("Quaia Quaia Coronoid");
-
-    std::vector<feature_map<celoe>> docs_models;
-
-    ngram_word_analyzer anal(1, make_filter());
-
-    docs_models.push_back(anal.analyze<celoe>(doc1));
-    docs_models.push_back(anal.analyze<celoe>(doc2));
-    docs_models.push_back(anal.analyze<celoe>(doc3));
-
-    dirichlet_optimizer optimizer(docs_models);
-
-    auto res_map = optimizer.minka_fpi();
-
-    for (auto iter: res_map){
-        std::cout << iter.first << " " << iter.second << std::endl;
-    }
-
-    return 0;
-}
-#endif // TEST_OPT
diff --git a/include/meta/stats/optimization.h b/include/meta/stats/optimization.h
deleted file mode 100644
index 06ab27f14..000000000
--- a/include/meta/stats/optimization.h
+++ /dev/null
@@ -1,306 +0,0 @@
-#ifndef OPTIMIZATION_H
-#define OPTIMIZATION_H
-
-#ifndef TEST_OPT
-#define TEST_OPT 1
-#endif // TEST_OPT
-
-#include <cmath>
-#include <map>
-#include <vector>
-
-#ifndef TEST_OPT
-#include "meta/stats/statistics.h"
-#include "meta/analyzers/featurizer.h"
-
-using namespace meta::stats;
-using namespace meta::util;
-using namespace meta::analyzers;
-
-#else
-
-#ifndef M_PIl
-/** The constant Pi in high precision */
-#define M_PIl 3.1415926535897932384626433832795029L
-#endif
-#ifndef M_GAMMAl
-/** Euler's constant in high precision */
-#define M_GAMMAl 0.5772156649015328606065120900824024L
-#endif
-#ifndef M_LN2l
-/** the natural logarithm of 2 in high precision */
-#define M_LN2l 0.6931471805599453094172321214581766L
-#endif
-
-/** The digamma function in long double precision.
-* @param x the real value of the argument
-* @return the value of the digamma (psi) function at that point
-* @author Richard J. Mathar
-* @since 2005-11-24
-*/
-long double digamma(long double x)
-{
-    /* force into the interval 1..3 */
-    if( x < 0.0L )
-        return digamma(1.0L-x)+M_PIl/tanl(M_PIl*(1.0L-x)) ;	/* reflection formula */
-    else if( x < 1.0L )
-        return digamma(1.0L+x)-1.0L/x ;
-    else if ( x == 1.0L)
-        return -M_GAMMAl ;
-    else if ( x == 2.0L)
-        return 1.0L-M_GAMMAl ;
-    else if ( x == 3.0L)
-        return 1.5L-M_GAMMAl ;
-    else if ( x > 3.0L)
-        /* duplication formula */
-        return 0.5L*(digamma(x/2.0L)+digamma((x+1.0L)/2.0L))+M_LN2l ;
-    else
-    {
-        /* Just for your information, the following lines contain
-        * the Maple source code to re-generate the table that is
-        * eventually becoming the Kncoe[] array below
-        * interface(prettyprint=0) :
-        * Digits := 63 :
-        * r := 0 :
-        *
-        * for l from 1 to 60 do
-        * 	d := binomial(-1/2,l) :
-        * 	r := r+d*(-1)^l*(Zeta(2*l+1) -1) ;
-        * 	evalf(r) ;
-        * 	print(%,evalf(1+Psi(1)-r)) ;
-        *o d :
-        *
-        * for N from 1 to 28 do
-        * 	r := 0 :
-        * 	n := N-1 :
-        *
-        *	for l from iquo(n+3,2) to 70 do
-        *		d := 0 :
-        *		for s from 0 to n+1 do
-        *		 d := d+(-1)^s*binomial(n+1,s)*binomial((s-1)/2,l) :
-        *		od :
-        *		if 2*l-n > 1 then
-        *		r := r+d*(-1)^l*(Zeta(2*l-n) -1) :
-        *		fi :
-        *	od :
-        *	print(evalf((-1)^n*2*r)) ;
-        *od :
-        *quit :
-        */
-        static long double Kncoe[] = { .30459198558715155634315638246624251L,
-        .72037977439182833573548891941219706L, -.12454959243861367729528855995001087L,
-        .27769457331927827002810119567456810e-1L, -.67762371439822456447373550186163070e-2L,
-        .17238755142247705209823876688592170e-2L, -.44817699064252933515310345718960928e-3L,
-        .11793660000155572716272710617753373e-3L, -.31253894280980134452125172274246963e-4L,
-        .83173997012173283398932708991137488e-5L, -.22191427643780045431149221890172210e-5L,
-        .59302266729329346291029599913617915e-6L, -.15863051191470655433559920279603632e-6L,
-        .42459203983193603241777510648681429e-7L, -.11369129616951114238848106591780146e-7L,
-        .304502217295931698401459168423403510e-8L, -.81568455080753152802915013641723686e-9L,
-        .21852324749975455125936715817306383e-9L, -.58546491441689515680751900276454407e-10L,
-        .15686348450871204869813586459513648e-10L, -.42029496273143231373796179302482033e-11L,
-        .11261435719264907097227520956710754e-11L, -.30174353636860279765375177200637590e-12L,
-        .80850955256389526647406571868193768e-13L, -.21663779809421233144009565199997351e-13L,
-        .58047634271339391495076374966835526e-14L, -.15553767189204733561108869588173845e-14L,
-        .41676108598040807753707828039353330e-15L, -.11167065064221317094734023242188463e-15L } ;
-
-        register long double Tn_1 = 1.0L ;	/* T_{n-1}(x), started at n=1 */
-        register long double Tn = x-2.0L ;	/* T_{n}(x) , started at n=1 */
-        register long double resul = Kncoe[0] + Kncoe[1]*Tn ;
-
-        x -= 2.0L ;
-
-        for(int n = 2 ; n < sizeof(Kncoe)/sizeof(long double) ;n++)
-        {
-            const long double Tn1 = 2.0L * x * Tn - Tn_1 ;	/* Chebyshev recursion, Eq. 22.7.4 Abramowitz-Stegun */
-            resul += Kncoe[n]*Tn1 ;
-            Tn_1 = Tn ;
-            Tn = Tn1 ;
-        }
-        return resul ;
-    }
-}
-
-template <class T> using feature_map = std::map<std::string, T>;
-
-#endif // TEST_OPT
-
-namespace meta
-{
-namespace stats
-{
-namespace opt
-{
-
-typedef uint64_t celoe;
-
-std::vector<long> get_docs_sizes(std::vector<feature_map<celoe>> docs_models){
-    std::vector<long> docs_sizes;
-
-    long doc_size;
-    for (int i = 0; i < docs_models.size(); i++){
-        doc_size = 0;
-
-        for (auto word: docs_models[i]){
-            doc_size += docs_models[i][word.first];
-        }
-
-        docs_sizes.push_back(doc_size);
-    }
-
-    return docs_sizes;
-}
-
-#ifndef TEST_OPT
-feature_map<celoe> get_ref_voc(std::vector<feature_map<celoe>> docs_models){
-    feature_map<celoe> ref_voc;
-    featurizer f(ref_voc);
-
-    for (auto doc_model: docs_models){
-        for (auto word: doc_model){
-            f(word.key(), word.value());
-        }
-    }
-
-    return ref_voc;
-}
-
-#else
-
-feature_map<celoe> get_ref_voc(std::vector<feature_map<celoe>> docs_models){
-    feature_map<celoe> ref_voc;
-
-    for (auto doc_model: docs_models){
-        for (auto word: doc_model){
-            ref_voc[word.first] += word.second;
-        }
-    }
-
-    return ref_voc;
-}
-
-celoe get_ref_voc_size(feature_map<celoe> ref_voc){
-    celoe ref_voc_size = 0;
-
-    for (auto word: ref_voc){
-        ref_voc_size += word.second;
-    }
-
-    return ref_voc_size;
-}
-
-#endif // TEST_OPT
-
-#include <iostream>
-using namespace std;
-
-class dirichlet_optimizer{
-public:
-    dirichlet_optimizer(std::vector<feature_map<celoe>> docs_models, int alpha=1)
-    {
-        this->docs_models_.assign(docs_models.begin(), docs_models.end());
-        this->docs_sizes_ = get_docs_sizes(docs_models);
-
-        this->default_alpha_ = alpha;
-
-        this->ref_voc_ = get_ref_voc(docs_models);
-        this->ref_voc_size_ = get_ref_voc_size(this->ref_voc_);
-
-        cout << this->ref_voc_size_ << endl;
-    }
-
-    double minka_fpi(double eps=1e-3, int max_iters=100){
-        std::map<std::string, double> alpha_m;
-
-        // create initial alpa_m vector
-        for (auto word: ref_voc_){
-            alpha_m[word.first] = default_alpha_ * word.second / ref_voc_size_;
-        }
-
-        // stoping criteria for the whole vector alpha_m
-        int vector_iteration = 0;
-        bool all_optimal;
-
-        while (vector_iteration <= max_iters && !all_optimal){
-            all_optimal = true;
-            std::string word_k;
-            double alpha_m_k, alpha_k, alpha_m_k_new;
-
-            cout << vector_iteration << endl;
-
-            for (auto alpha_m_iter: alpha_m){
-                word_k = alpha_m_iter.first;
-                alpha_m_k = alpha_m_iter.second;
-
-                alpha_k = alpha_m_k / ((double)ref_voc_[word_k] / ref_voc_size_);
-
-                // make a step and find new alpha_m_k
-                alpha_m_k_new = minka_fpi_step(word_k, alpha_k, alpha_m_k);
-
-                if (std::abs(alpha_m_k - alpha_m_k_new) > eps){
-                    all_optimal = false;
-
-                    alpha_m[word_k] = alpha_m_k_new;
-                }
-            }
-
-            vector_iteration++;
-        }
-
-        cout << endl << "Alpha_m for each word:" << endl;
-
-        double optimal_alpha = 0;
-        for (auto alpha_m_iter: alpha_m){
-            cout << alpha_m_iter.first << " " << alpha_m_iter.second << std::endl;
-            optimal_alpha += alpha_m_iter.second;
-        }
-
-        return optimal_alpha;
-    }
-
-    double minka_newton(){
-        // todo
-    }
-
-    double minka_lou(){
-        // todo
-    }
-
-private:
-    double minka_fpi_step(std::string word_k, double alpha_k, double alpha_m_k){
-        double nom = 0, denom = 0;
-        double alpha_m_k_dig = digamma(alpha_m_k),
-                alpha_k_dig = digamma(alpha_k);
-
-        long all_words_count, k_words_count;
-
-        for (int d = 0; d < docs_models_.size(); d++){
-
-            nom += digamma(docs_models_[d][word_k] + alpha_m_k) - alpha_m_k_dig;
-
-            denom += digamma(docs_sizes_[d] + alpha_k) - alpha_k_dig;
-
-        }
-
-        double alpha_m_k_new = alpha_m_k * nom / denom;;
-
-        cout << word_k << " " << alpha_k << " " << alpha_m_k << " " << alpha_m_k / ((double)ref_voc_[word_k] / ref_voc_size_) << " " << alpha_m_k_new << " " << nom << " " << denom << endl;
-
-        return alpha_m_k_new;
-    }
-
-    double minka_newton_iters();
-    double minka_lou_iters();
-
-    std::vector<feature_map<celoe>> docs_models_;
-    std::vector<long> docs_sizes_;
-
-    feature_map<celoe> ref_voc_;
-    celoe ref_voc_size_;
-
-    double default_alpha_;
-};
-}
-}
-}
-
-#endif // OPTIMIZATION_H
diff --git a/src/index/ranker/dirichlet_prior.cpp b/src/index/ranker/dirichlet_prior.cpp
index 07536afbe..76230043e 100644
--- a/src/index/ranker/dirichlet_prior.cpp
+++ b/src/index/ranker/dirichlet_prior.cpp
@@ -33,7 +33,7 @@ void dirichlet_prior::save(std::ostream& out) const
     io::packed::write(out, mu_);
 }
 
-float dirichlet_prior::smoothed_prob(const score_data& sd) const
+float dirichlet_prior::smoothed_prob(score_data& sd)
 {
     float pc = static_cast<float>(sd.corpus_term_count) / sd.total_terms;
     float numerator = sd.doc_term_count + mu_ * pc;

From 54d727205163a07f583694f3582a8adb422337c7 Mon Sep 17 00:00:00 2001
From: M <makkolts@gmail.com>
Date: Wed, 29 Nov 2017 17:23:44 +0300
Subject: [PATCH 08/30] Deletion of previous stuff

---
 src/stats/optimization.cpp | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 src/stats/optimization.cpp

diff --git a/src/stats/optimization.cpp b/src/stats/optimization.cpp
deleted file mode 100644
index 87a7f3af4..000000000
--- a/src/stats/optimization.cpp
+++ /dev/null
@@ -1 +0,0 @@
-#include "meta/stats/optimization.h"

From 65851899819645fa5cd33f4f560ef192e1b87441 Mon Sep 17 00:00:00 2001
From: M <makkolts@gmail.com>
Date: Wed, 29 Nov 2017 18:54:57 +0300
Subject: [PATCH 09/30] Test for dirichlet optimizations

---
 include/meta/index/ranker/dirichlet_prior.h | 21 ++++++++++++++-------
 src/index/ranker/CMakeLists.txt             |  2 ++
 src/index/ranker/dirichlet_prior.cpp        |  2 +-
 src/index/ranker/test_opt/CMakeLists.txt    |  8 ++++++++
 src/index/ranker/test_opt/test.cpp          | 12 ++++++++++++
 src/stats/CMakeLists.txt                    |  2 +-
 6 files changed, 38 insertions(+), 9 deletions(-)
 create mode 100644 src/index/ranker/test_opt/CMakeLists.txt
 create mode 100644 src/index/ranker/test_opt/test.cpp

diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index 37ff18504..0f3bc3bc7 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -72,26 +72,33 @@ class dirichlet_prior : public language_model_ranker
 };
 
 class dirichlet_prior_opt : public dirichlet_prior{
-    void rank(ranker_context &ctx, uint64_t num_results, const filter_function_type &filter) const override{
+//    void rank(ranker_context &ctx, uint64_t num_results, const filter_function_type &filter) const override{
+//        ranking_function::rank(ctx, num_results, filter);
+//    }
+    template <class ForwardIterator>
+    std::vector<search_result> score(inverted_index& idx, ForwardIterator begin,
+                                     ForwardIterator end,
+                                     uint64_t num_results = 10)
+    {
         // optimize mu according to ranker_context before ranking
-        this->optimize_mu(ctx);
+        this->optimize_mu(idx);
 
-        ranking_function::rank(ctx, num_results, filter);
+        return ranker::score(idx, begin, end, num_results);
     }
 
-    virtual void optimize_mu(const ranker_context &ctx) = 0;
+    virtual void optimize_mu(const inverted_index& idx) = 0;
 };
 
 class digamma_rec: public dirichlet_prior_opt{
-    void optimize_mu(const ranker_context &ctx) override;
+    void optimize_mu(const inverted_index& idx) override;
 };
 
 class log_approx: public dirichlet_prior_opt{
-    void optimize_mu(const ranker_context &ctx) override;
+    void optimize_mu(const inverted_index& idx) override;
 };
 
 class mackay_peto: public dirichlet_prior_opt{
-    void optimize_mu(const ranker_context &ctx) override;
+    void optimize_mu(const inverted_index& idx) override;
 };
 
 /**
diff --git a/src/index/ranker/CMakeLists.txt b/src/index/ranker/CMakeLists.txt
index 20518f751..e386d54b6 100644
--- a/src/index/ranker/CMakeLists.txt
+++ b/src/index/ranker/CMakeLists.txt
@@ -1,5 +1,7 @@
 project(meta-ranker)
 
+add_subdirectory(test_opt)
+
 add_library(meta-ranker absolute_discount.cpp
                         dirichlet_prior.cpp
                         jelinek_mercer.cpp
diff --git a/src/index/ranker/dirichlet_prior.cpp b/src/index/ranker/dirichlet_prior.cpp
index 76230043e..07536afbe 100644
--- a/src/index/ranker/dirichlet_prior.cpp
+++ b/src/index/ranker/dirichlet_prior.cpp
@@ -33,7 +33,7 @@ void dirichlet_prior::save(std::ostream& out) const
     io::packed::write(out, mu_);
 }
 
-float dirichlet_prior::smoothed_prob(score_data& sd)
+float dirichlet_prior::smoothed_prob(const score_data& sd) const
 {
     float pc = static_cast<float>(sd.corpus_term_count) / sd.total_terms;
     float numerator = sd.doc_term_count + mu_ * pc;
diff --git a/src/index/ranker/test_opt/CMakeLists.txt b/src/index/ranker/test_opt/CMakeLists.txt
new file mode 100644
index 000000000..e03d4f7f9
--- /dev/null
+++ b/src/index/ranker/test_opt/CMakeLists.txt
@@ -0,0 +1,8 @@
+project(meta-dirichlet-test)
+
+include_directories(../../../../include)
+
+add_executable(test_opt test.cpp)
+
+target_link_libraries(test_opt meta-ranker)
+
diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp
new file mode 100644
index 000000000..e48e078a8
--- /dev/null
+++ b/src/index/ranker/test_opt/test.cpp
@@ -0,0 +1,12 @@
+#include "meta/corpus/document.h"
+#include "meta/index/ranker/all.h"
+#include "meta/index/forward_index.h"
+
+#include <iostream>
+
+
+int main(){
+
+  std::cout << "Quaia!" << std::endl;
+  meta::index::dirichlet_prior ranker;
+}
diff --git a/src/stats/CMakeLists.txt b/src/stats/CMakeLists.txt
index 3e09824f2..c9872741c 100644
--- a/src/stats/CMakeLists.txt
+++ b/src/stats/CMakeLists.txt
@@ -1,6 +1,6 @@
 project(meta-stats)
 
-add_library(meta-stats running_stats.cpp optimization.cpp)
+add_library(meta-stats running_stats.cpp)
 target_link_libraries(meta-stats meta-definitions)
 
 install(TARGETS meta-stats

From c0a357cf363163735fac893fd7de62031dda1d28 Mon Sep 17 00:00:00 2001
From: M <makkolts@gmail.com>
Date: Wed, 29 Nov 2017 20:38:59 +0300
Subject: [PATCH 10/30] Private/public methods

---
 include/meta/index/ranker/dirichlet_prior.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index 0f3bc3bc7..63f44fb2c 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -75,6 +75,7 @@ class dirichlet_prior_opt : public dirichlet_prior{
 //    void rank(ranker_context &ctx, uint64_t num_results, const filter_function_type &filter) const override{
 //        ranking_function::rank(ctx, num_results, filter);
 //    }
+public:
     template <class ForwardIterator>
     std::vector<search_result> score(inverted_index& idx, ForwardIterator begin,
                                      ForwardIterator end,
@@ -85,7 +86,7 @@ class dirichlet_prior_opt : public dirichlet_prior{
 
         return ranker::score(idx, begin, end, num_results);
     }
-
+private:
     virtual void optimize_mu(const inverted_index& idx) = 0;
 };
 

From 76d32aed62081f68e675f698359ece9976fb547d Mon Sep 17 00:00:00 2001
From: Aleksey <alex2304el@gmail.com>
Date: Wed, 29 Nov 2017 20:56:14 +0300
Subject: [PATCH 11/30] [opt] test indexes

---
 src/index/ranker/test_opt/test.cpp | 44 ++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp
index e48e078a8..c6dc0de61 100644
--- a/src/index/ranker/test_opt/test.cpp
+++ b/src/index/ranker/test_opt/test.cpp
@@ -4,9 +4,47 @@
 
 #include <iostream>
 
+#include "meta/index/inverted_index.h"
+#include "meta/logging/logger.h"
+#include "meta/parser/analyzers/tree_analyzer.h"
+#include "meta/sequence/analyzers/ngram_pos_analyzer.h"
+#include "meta/util/time.h"
 
-int main(){
+using namespace meta;
 
-  std::cout << "Quaia!" << std::endl;
-  meta::index::dirichlet_prior ranker;
+
+int main(int argc, char* argv[])
+{
+    if (argc != 2)
+    {
+        std::cerr << "Usage:\t" << argv[0] << " configFile" << std::endl;
+        return 1;
+    }
+
+    // Turn on logging to std::cerr.
+    logging::set_cerr_logging();
+
+    // Register additional analyzers
+    parser::register_analyzers();
+    sequence::register_analyzers();
+
+    // Time how long it takes to create the index. By default, common::time's
+    //  unit of measurement is milliseconds.
+    auto time = common::time([&]()
+    {
+        // Creates an inverted index with no cache. We don't need a cache here
+        //  since we're never searching the index, only building it.
+        auto config = cpptoml::parse_file(argv[1]);
+        auto idx = index::make_index<index::inverted_index>(*config);
+
+        // Print out some data about the corpus.
+        std::cout << "Number of documents: " << idx->num_docs() << std::endl;
+        std::cout << "Avg Doc Length: " << idx->avg_doc_length() << std::endl;
+        std::cout << "Unique Terms: " << idx->unique_terms() << std::endl;
+    });
+
+    std::cout << "Index generation took: " << time.count() / 1000.0
+              << " seconds" << std::endl;
+
+    return 0;
 }

From 4ccda58e02e2c50e5efaff27d3609056334a8fed Mon Sep 17 00:00:00 2001
From: M <makkolts@gmail.com>
Date: Wed, 29 Nov 2017 23:15:35 +0300
Subject: [PATCH 12/30] Interface for methods

---
 include/meta/index/ranker/dirichlet_prior.h | 20 ++++++++++++--------
 src/index/ranker/test_opt/CMakeLists.txt    |  4 +++-
 src/index/ranker/test_opt/test.cpp          |  9 ++++-----
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index 63f44fb2c..ecd3b0da0 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -66,15 +66,13 @@ class dirichlet_prior : public language_model_ranker
      */
     float doc_constant(const score_data& sd) const override;
 
-  private:
+  protected:
     /// the Dirichlet prior parameter
-    const float mu_;
+//    const float mu_;
+    float mu_;
 };
 
 class dirichlet_prior_opt : public dirichlet_prior{
-//    void rank(ranker_context &ctx, uint64_t num_results, const filter_function_type &filter) const override{
-//        ranking_function::rank(ctx, num_results, filter);
-//    }
 public:
     template <class ForwardIterator>
     std::vector<search_result> score(inverted_index& idx, ForwardIterator begin,
@@ -86,20 +84,26 @@ class dirichlet_prior_opt : public dirichlet_prior{
 
         return ranker::score(idx, begin, end, num_results);
     }
+
+    float get_optimized_mu(const inverted_index& idx) {
+        optimize_mu(idx);
+        return mu_;
+    }
+
 private:
     virtual void optimize_mu(const inverted_index& idx) = 0;
 };
 
 class digamma_rec: public dirichlet_prior_opt{
-    void optimize_mu(const inverted_index& idx) override;
+    void optimize_mu(const inverted_index& idx) override { mu_ = 0;};
 };
 
 class log_approx: public dirichlet_prior_opt{
-    void optimize_mu(const inverted_index& idx) override;
+    void optimize_mu(const inverted_index& idx) override { mu_ = 0;};
 };
 
 class mackay_peto: public dirichlet_prior_opt{
-    void optimize_mu(const inverted_index& idx) override;
+    void optimize_mu(const inverted_index& idx) override { mu_ = 0;};
 };
 
 /**
diff --git a/src/index/ranker/test_opt/CMakeLists.txt b/src/index/ranker/test_opt/CMakeLists.txt
index e03d4f7f9..1e8cdc0ee 100644
--- a/src/index/ranker/test_opt/CMakeLists.txt
+++ b/src/index/ranker/test_opt/CMakeLists.txt
@@ -4,5 +4,7 @@ include_directories(../../../../include)
 
 add_executable(test_opt test.cpp)
 
-target_link_libraries(test_opt meta-ranker)
+target_link_libraries(test_opt meta-ranker
+    meta-sequence-analyzers
+    meta-parser-analyzers)
 
diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp
index c6dc0de61..7fc4e5e0b 100644
--- a/src/index/ranker/test_opt/test.cpp
+++ b/src/index/ranker/test_opt/test.cpp
@@ -37,13 +37,12 @@ int main(int argc, char* argv[])
         auto config = cpptoml::parse_file(argv[1]);
         auto idx = index::make_index<index::inverted_index>(*config);
 
-        // Print out some data about the corpus.
-        std::cout << "Number of documents: " << idx->num_docs() << std::endl;
-        std::cout << "Avg Doc Length: " << idx->avg_doc_length() << std::endl;
-        std::cout << "Unique Terms: " << idx->unique_terms() << std::endl;
+        // Create and make score of optimizer
+        index::digamma_rec ranker;
+        std::cout << ranker.get_optimized_mu(*idx) << std::endl;
     });
 
-    std::cout << "Index generation took: " << time.count() / 1000.0
+    std::cout << "Method took: " << time.count() / 1000.0
               << " seconds" << std::endl;
 
     return 0;

From 248c1515dee3c7023228e27d92ab6d706344e03e Mon Sep 17 00:00:00 2001
From: M <makkolts@gmail.com>
Date: Thu, 30 Nov 2017 00:22:50 +0300
Subject: [PATCH 13/30] Refactoring of optimization interface

---
 include/meta/index/ranker/dirichlet_prior.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index ecd3b0da0..b53bd86ff 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -86,11 +86,16 @@ class dirichlet_prior_opt : public dirichlet_prior{
     }
 
     float get_optimized_mu(const inverted_index& idx) {
-        optimize_mu(idx);
+        optimize(idx);
         return mu_;
     }
 
 private:
+    void optimize(const inverted_index& idx) {
+        doc_id y = idx.docs()[0];
+
+    }
+
     virtual void optimize_mu(const inverted_index& idx) = 0;
 };
 

From 61ece78e0c0b640383a25428e4b4c72852950007 Mon Sep 17 00:00:00 2001
From: Alex2304 <alex2304el@gmail.com>
Date: Thu, 30 Nov 2017 00:24:15 +0300
Subject: [PATCH 14/30] [opt] tmp for merge

---
 include/meta/index/ranker/dirichlet_prior.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index ecd3b0da0..a9651607c 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -91,7 +91,12 @@ class dirichlet_prior_opt : public dirichlet_prior{
     }
 
 private:
-    virtual void optimize_mu(const inverted_index& idx) = 0;
+     void optimize_mu(const inverted_index& idx){
+         // TODO: parse idx
+         idx.term_freq;
+         idx.doc_size;
+         idx.
+     }
 };
 
 class digamma_rec: public dirichlet_prior_opt{

From f9792648e6b67c58a280b10bfe907da478aa5cc8 Mon Sep 17 00:00:00 2001
From: M <makkolts@gmail.com>
Date: Thu, 30 Nov 2017 01:53:08 +0300
Subject: [PATCH 15/30] Tests for all functions at same time

---
 include/meta/index/ranker/dirichlet_prior.h |  1 -
 src/index/ranker/test_opt/test.cpp          | 32 +++++++++++++++++----
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index b53bd86ff..c7d3dc9bf 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -92,7 +92,6 @@ class dirichlet_prior_opt : public dirichlet_prior{
 
 private:
     void optimize(const inverted_index& idx) {
-        doc_id y = idx.docs()[0];
 
     }
 
diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp
index 7fc4e5e0b..b85fdbfca 100644
--- a/src/index/ranker/test_opt/test.cpp
+++ b/src/index/ranker/test_opt/test.cpp
@@ -28,21 +28,41 @@ int main(int argc, char* argv[])
     parser::register_analyzers();
     sequence::register_analyzers();
 
+    // Creates an inverted index with no cache. We don't need a cache here
+    //  since we're never searching the index, only building it.
+    auto config = cpptoml::parse_file(argv[1]);
+    auto idx = index::make_index<index::inverted_index>(*config);
+
     // Time how long it takes to create the index. By default, common::time's
     //  unit of measurement is milliseconds.
     auto time = common::time([&]()
     {
-        // Creates an inverted index with no cache. We don't need a cache here
-        //  since we're never searching the index, only building it.
-        auto config = cpptoml::parse_file(argv[1]);
-        auto idx = index::make_index<index::inverted_index>(*config);
-
         // Create and make score of optimizer
         index::digamma_rec ranker;
         std::cout << ranker.get_optimized_mu(*idx) << std::endl;
     });
 
-    std::cout << "Method took: " << time.count() / 1000.0
+    std::cout << "Method DR took: " << time.count() / 1000.0
+              << " seconds" << std::endl;
+
+    time = common::time([&]()
+    {
+        // Create and make score of optimizer
+        index::log_approx ranker;
+        std::cout << ranker.get_optimized_mu(*idx) << std::endl;
+    });
+
+    std::cout << "Method LA took: " << time.count() / 1000.0
+              << " seconds" << std::endl;
+
+    time = common::time([&]()
+    {
+        // Create and make score of optimizer
+        index::mackay_peto ranker;
+        std::cout << ranker.get_optimized_mu(*idx) << std::endl;
+    });
+
+    std::cout << "Method MP took: " << time.count() / 1000.0
               << " seconds" << std::endl;
 
     return 0;

From ed475b5339e7dd8dd4012a55f19e9948aaf26a1c Mon Sep 17 00:00:00 2001
From: Alex2304 <alex2304el@gmail.com>
Date: Thu, 30 Nov 2017 04:23:07 +0300
Subject: [PATCH 16/30] [opt] + first method without testing

---
 include/meta/index/ranker/dirichlet_prior.h | 135 ++++++++++++++++++--
 1 file changed, 124 insertions(+), 11 deletions(-)

diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index f0a236c6b..a11a0a2fe 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -74,6 +74,32 @@ class dirichlet_prior : public language_model_ranker
     float mu_;
 };
 
+struct docs_data
+{
+    // general info
+
+    inverted_index& idx;
+    /// ids of all documents
+    std::vector<doc_id> doc_ids;
+    /// ids of all terms
+    std::vector<term_id> term_ids;
+
+    /**
+     * Constructor to initialize most elements.
+     * @param p_idx The index that is being used
+     * @param p_doc_ids ids of all docs
+     * @param p_term_ids ids of all terms
+     */
+    score_data(inverted_index& p_idx, std::vector<doc_id> p_doc_ids, std::vector<doc_id> p_term_ids,
+               uint64_t p_total_terms, float p_query_length)
+        : idx(p_idx), // gcc no non-const ref init from brace init list
+          doc_ids{p_doc_ids},
+          term_ids{p_term_ids}
+    {
+        /* nothing */
+    }
+};
+
 class dirichlet_prior_opt : public dirichlet_prior{
 public:
     template <class ForwardIterator>
@@ -88,34 +114,121 @@ class dirichlet_prior_opt : public dirichlet_prior{
     }
 
     float get_optimized_mu(const inverted_index& idx) {
-        optimize(idx);
+        optimize_mu(idx);
+
         return mu_;
     }
 
 private:
-    void optimize(const inverted_index& idx) {
-        // TODO: parse idx
+    void optimize_mu(const inverted_index& idx) {
         auto docs_ids = idx.docs();
         auto terms_ids = idx.terms();
+        docs_data  dd{idx, docs_ids, terms_ids};
 
-        std::cout << idx.unique_terms() << std::endl;
+        optimize_mu(dd);
+//        std::cout << idx.unique_terms() << std::endl;
 
-        for (auto d_id: docs_ids){
-            for (auto t_id: terms_ids){
-                std::cout << idx.term_freq(t_id, d_id) << std::endl;
-            }
-        }
+//        for (auto d_id: docs_ids){
+//            for (auto t_id: terms_ids){
+//                std::cout << idx.term_freq(t_id, d_id) << std::endl;
+//            }
+//        }
 
+//        optimize_mu(std::vector<doc_id> docs_ids,
 //        idx.unique_terms()
 //        idx.total_corpus_terms()
 
     }
 
-    virtual void optimize_mu(const inverted_index& idx) = 0;
+    virtual void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) = 0;
 };
 
+// # TODO: choose template type instead of long
+typedef long count_d;
+
 class digamma_rec: public dirichlet_prior_opt{
-    void optimize_mu(const inverted_index& idx) override { mu_ = 0;};
+    void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) override {
+        // fill C_.(n) and C_k(n)
+
+        std::map<count_d, count_d> docs_counts;
+        std::map<term_id, std::map<count_d, count_d>> terms_docs_counts;
+        long doc_size, doc_term_freq;
+
+        for (auto d_id: dd.doc_ids){
+            doc_size = dd.idx.doc_size(d_id);
+
+            //// increase number of docs with the given size (C_.(n))
+            docs_counts[doc_size] += 1;
+
+            for (auto t_id: dd.idx.terms(d_id)){
+                doc_term_freq = dd.idx.term_freq(t_id, d_id);
+
+                //// increase number of docs with the given count of word t_id (C_k(n))
+                terms_docs_counts[t_id][doc_term_freq] += 1;
+            }
+        }
+
+//        // sort by ascending of occurences
+//        std::sort(docs_counts.begin(), items.end());
+//        for (auto key: terms_docs_counts){
+//            std::sort(key.second.begin(), key.second.end());
+//        }
+
+        // p(w|REF) = dd.idx.total_num_occurences(t_id)
+
+        // fill start vector alpha_m
+        double alpha = 1;
+        std::map<term_id, double> alpha_m;
+
+        for (auto t_id: dd.idx.terms()){
+            alpha_m[t_id] = dd.idx.total_num_occurences(t_id) * alpha;
+        }
+
+        double D, S;
+        bool converged = false;
+
+        while (!converged){
+            D = 0;
+            S = 0;
+
+            alpha = 0;
+            for (auto alpha_m_k: alpha_m){
+                alpha += alpha_m_k;
+            }
+
+            count_d n, c_d;
+            for (auto kv: docs_counts){
+                n = kv.first;
+                c_d = kv.second;
+
+                D += 1/(n - 1 + alpha);
+                S += c_d * D;
+            }
+
+            std::map<count_d, count_d> c_n;
+            term_id k;
+            double S_k;
+            for (auto kv: terms_docs_counts){
+                k = kv.first;
+                c_n = kv.second;
+
+                D = 0;
+                S_k = 0;
+
+                count_d n, c_k_n;
+                for (auto kv_: c_n){
+                    n = kv_.first;
+                    c_k_n = kv_.second;
+
+                    D += 1/(n - 1 + alpha * m_k);
+                    S_k += c_k_n * D;
+                }
+
+                alpha_m[k] *= S_k / S;
+            }
+        }
+
+    }
 };
 
 class log_approx: public dirichlet_prior_opt{

From 4528ec667f59e9900fd64a0b283357be86e05806 Mon Sep 17 00:00:00 2001
From: Alex2304 <alex2304el@gmail.com>
Date: Thu, 30 Nov 2017 04:43:19 +0300
Subject: [PATCH 17/30] [opt] *first method builds

---
 include/meta/index/ranker/dirichlet_prior.h | 46 +++++++++++++++------
 src/index/ranker/test_opt/test.cpp          | 28 ++++++-------
 2 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index a11a0a2fe..702d28638 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -12,6 +12,7 @@
 #include "meta/index/ranker/lm_ranker.h"
 #include "meta/index/ranker/ranker_factory.h"
 
+#include <cmath>
 #include <iostream>
 
 namespace meta
@@ -78,7 +79,7 @@ struct docs_data
 {
     // general info
 
-    inverted_index& idx;
+    const inverted_index& idx;
     /// ids of all documents
     std::vector<doc_id> doc_ids;
     /// ids of all terms
@@ -90,8 +91,7 @@ struct docs_data
      * @param p_doc_ids ids of all docs
      * @param p_term_ids ids of all terms
      */
-    score_data(inverted_index& p_idx, std::vector<doc_id> p_doc_ids, std::vector<doc_id> p_term_ids,
-               uint64_t p_total_terms, float p_query_length)
+    docs_data(const inverted_index& p_idx, std::vector<doc_id> p_doc_ids, std::vector<term_id> p_term_ids)
         : idx(p_idx), // gcc no non-const ref init from brace init list
           doc_ids{p_doc_ids},
           term_ids{p_term_ids}
@@ -119,11 +119,22 @@ class dirichlet_prior_opt : public dirichlet_prior{
         return mu_;
     }
 
+protected:
+    inline double get_alpha(std::map<term_id, double> alpha_m){
+        double alpha = 0;
+
+        for (auto alpha_m_k: alpha_m){
+            alpha += alpha_m_k.second;
+        }
+
+        return alpha;
+    }
+
 private:
     void optimize_mu(const inverted_index& idx) {
         auto docs_ids = idx.docs();
         auto terms_ids = idx.terms();
-        docs_data  dd{idx, docs_ids, terms_ids};
+        docs_data dd{idx, docs_ids, terms_ids};
 
         optimize_mu(dd);
 //        std::cout << idx.unique_terms() << std::endl;
@@ -177,7 +188,7 @@ class digamma_rec: public dirichlet_prior_opt{
         // p(w|REF) = dd.idx.total_num_occurences(t_id)
 
         // fill start vector alpha_m
-        double alpha = 1;
+        double alpha = 1, alpha_mk_new;
         std::map<term_id, double> alpha_m;
 
         for (auto t_id: dd.idx.terms()){
@@ -185,16 +196,15 @@ class digamma_rec: public dirichlet_prior_opt{
         }
 
         double D, S;
-        bool converged = false;
+        bool all_optimized = false;
+        int iters_num = 0;
 
-        while (!converged){
+        while (!all_optimized && iters_num < max_iter){
             D = 0;
             S = 0;
+            all_optimized = true;
 
-            alpha = 0;
-            for (auto alpha_m_k: alpha_m){
-                alpha += alpha_m_k;
-            }
+            alpha = get_alpha(alpha_m);
 
             count_d n, c_d;
             for (auto kv: docs_counts){
@@ -220,14 +230,24 @@ class digamma_rec: public dirichlet_prior_opt{
                     n = kv_.first;
                     c_k_n = kv_.second;
 
-                    D += 1/(n - 1 + alpha * m_k);
+                    D += 1/(n - 1 + alpha_m[k]);
                     S_k += c_k_n * D;
                 }
 
-                alpha_m[k] *= S_k / S;
+                alpha_mk_new = alpha_m[k] * S_k / S;
+
+                if (std::abs(alpha_mk_new - alpha_m[k]) > eps){
+                    all_optimized = false;
+                }
+
+                alpha_m[k] *= alpha_mk_new;
             }
+
+            iters_num++;
         }
 
+        mu_ = get_alpha(alpha_m);
+
     }
 };
 
diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp
index b85fdbfca..756ae45cf 100644
--- a/src/index/ranker/test_opt/test.cpp
+++ b/src/index/ranker/test_opt/test.cpp
@@ -45,22 +45,22 @@ int main(int argc, char* argv[])
     std::cout << "Method DR took: " << time.count() / 1000.0
               << " seconds" << std::endl;
 
-    time = common::time([&]()
-    {
-        // Create and make score of optimizer
-        index::log_approx ranker;
-        std::cout << ranker.get_optimized_mu(*idx) << std::endl;
-    });
+//    time = common::time([&]()
+//    {
+//        // Create and make score of optimizer
+//        index::log_approx ranker;
+//        std::cout << ranker.get_optimized_mu(*idx) << std::endl;
+//    });
 
-    std::cout << "Method LA took: " << time.count() / 1000.0
-              << " seconds" << std::endl;
+//    std::cout << "Method LA took: " << time.count() / 1000.0
+//              << " seconds" << std::endl;
 
-    time = common::time([&]()
-    {
-        // Create and make score of optimizer
-        index::mackay_peto ranker;
-        std::cout << ranker.get_optimized_mu(*idx) << std::endl;
-    });
+//    time = common::time([&]()
+//    {
+//        // Create and make score of optimizer
+//        index::mackay_peto ranker;
+//        std::cout << ranker.get_optimized_mu(*idx) << std::endl;
+//    });
 
     std::cout << "Method MP took: " << time.count() / 1000.0
               << " seconds" << std::endl;

From 312a485f45c1d64012e4de3ed8f401b6551d9199 Mon Sep 17 00:00:00 2001
From: Alex2304 <alex2304el@gmail.com>
Date: Thu, 30 Nov 2017 05:27:03 +0300
Subject: [PATCH 18/30] [opt] * method works

---
 include/meta/index/ranker/dirichlet_prior.h | 52 ++++++++++++++-------
 src/index/ranker/test_opt/test.cpp          |  4 +-
 2 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index 702d28638..1f914950f 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -15,6 +15,8 @@
 #include <cmath>
 #include <iostream>
 
+using namespace std;
+
 namespace meta
 {
 namespace index
@@ -75,6 +77,10 @@ class dirichlet_prior : public language_model_ranker
     float mu_;
 };
 
+
+// # TODO: choose template type instead of long
+typedef long count_d;
+
 struct docs_data
 {
     // general info
@@ -84,6 +90,8 @@ struct docs_data
     std::vector<doc_id> doc_ids;
     /// ids of all terms
     std::vector<term_id> term_ids;
+    /// total size of documents
+    count_d ref_size;
 
     /**
      * Constructor to initialize most elements.
@@ -91,10 +99,11 @@ struct docs_data
      * @param p_doc_ids ids of all docs
      * @param p_term_ids ids of all terms
      */
-    docs_data(const inverted_index& p_idx, std::vector<doc_id> p_doc_ids, std::vector<term_id> p_term_ids)
+    docs_data(const inverted_index& p_idx, std::vector<doc_id> p_doc_ids, std::vector<term_id> p_term_ids, count_d p_ref_size)
         : idx(p_idx), // gcc no non-const ref init from brace init list
           doc_ids{p_doc_ids},
-          term_ids{p_term_ids}
+          term_ids{p_term_ids},
+          ref_size{p_ref_size}
     {
         /* nothing */
     }
@@ -113,8 +122,8 @@ class dirichlet_prior_opt : public dirichlet_prior{
         return ranker::score(idx, begin, end, num_results);
     }
 
-    float get_optimized_mu(const inverted_index& idx) {
-        optimize_mu(idx);
+    float get_optimized_mu(const inverted_index& idx, float eps=1e-6, int max_iter=100) {
+        optimize_mu(idx, eps, max_iter);
 
         return mu_;
     }
@@ -131,12 +140,17 @@ class dirichlet_prior_opt : public dirichlet_prior{
     }
 
 private:
-    void optimize_mu(const inverted_index& idx) {
+    void optimize_mu(const inverted_index& idx, float eps=1e-6, int max_iter=100) {
         auto docs_ids = idx.docs();
         auto terms_ids = idx.terms();
-        docs_data dd{idx, docs_ids, terms_ids};
 
-        optimize_mu(dd);
+        count_d ref_size = 0;
+        for (auto& id : docs_ids)
+            ref_size += idx.doc_size(id);
+
+        docs_data dd{idx, docs_ids, terms_ids, ref_size};
+
+        optimize_mu(dd, eps, max_iter);
 //        std::cout << idx.unique_terms() << std::endl;
 
 //        for (auto d_id: docs_ids){
@@ -154,10 +168,7 @@ class dirichlet_prior_opt : public dirichlet_prior{
     virtual void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) = 0;
 };
 
-// # TODO: choose template type instead of long
-typedef long count_d;
-
-class digamma_rec: public dirichlet_prior_opt{
+class digamma_rec_opt: public dirichlet_prior_opt{
     void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) override {
         // fill C_.(n) and C_k(n)
 
@@ -191,21 +202,25 @@ class digamma_rec: public dirichlet_prior_opt{
         double alpha = 1, alpha_mk_new;
         std::map<term_id, double> alpha_m;
 
+        cout << "Start alpha: ";
         for (auto t_id: dd.idx.terms()){
             alpha_m[t_id] = dd.idx.total_num_occurences(t_id) * alpha;
+            alpha_m[t_id] /= (double)dd.ref_size;
+            cout << alpha_m[t_id] << " ";
         }
 
         double D, S;
         bool all_optimized = false;
-        int iters_num = 0;
+        int iter_num = 0;
 
-        while (!all_optimized && iters_num < max_iter){
+        while (!all_optimized && iter_num < max_iter){
             D = 0;
             S = 0;
             all_optimized = true;
 
             alpha = get_alpha(alpha_m);
 
+            cout << "\nIter " << iter_num << " alpha = " << alpha;
             count_d n, c_d;
             for (auto kv: docs_counts){
                 n = kv.first;
@@ -243,7 +258,12 @@ class digamma_rec: public dirichlet_prior_opt{
                 alpha_m[k] *= alpha_mk_new;
             }
 
-            iters_num++;
+            cout << "\nVector alpha_m after the iter: ";
+            for (auto kv: alpha_m){
+                cout << " " << kv.second;
+            }
+
+            iter_num++;
         }
 
         mu_ = get_alpha(alpha_m);
@@ -251,11 +271,11 @@ class digamma_rec: public dirichlet_prior_opt{
     }
 };
 
-class log_approx: public dirichlet_prior_opt{
+class log_approx_opt: public dirichlet_prior_opt{
 //    void optimize_mu(const inverted_index& idx) override { mu_ = 0;};
 };
 
-class mackay_peto: public dirichlet_prior_opt{
+class mackay_peto_opt: public dirichlet_prior_opt{
 //    void optimize_mu(const inverted_index& idx) override { mu_ = 0;};
 };
 
diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp
index 756ae45cf..ff2afc130 100644
--- a/src/index/ranker/test_opt/test.cpp
+++ b/src/index/ranker/test_opt/test.cpp
@@ -38,8 +38,8 @@ int main(int argc, char* argv[])
     auto time = common::time([&]()
     {
         // Create and make score of optimizer
-        index::digamma_rec ranker;
-        std::cout << ranker.get_optimized_mu(*idx) << std::endl;
+        index::digamma_rec_opt ranker;
+        std::cout << ranker.get_optimized_mu(*idx, 1e-6, 100) << std::endl;
     });
 
     std::cout << "Method DR took: " << time.count() / 1000.0

From b60cc54a7549120280ad63855d17136fc1003d32 Mon Sep 17 00:00:00 2001
From: Alex2304 <alex2304el@gmail.com>
Date: Thu, 30 Nov 2017 18:20:00 +0300
Subject: [PATCH 19/30] [opt] *first method debugged

---
 include/meta/index/ranker/dirichlet_prior.h | 81 ++++++++++++++-------
 src/index/ranker/test_opt/test.cpp          |  4 +-
 2 files changed, 58 insertions(+), 27 deletions(-)

diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index 1f914950f..1133ad941 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -168,7 +168,7 @@ class dirichlet_prior_opt : public dirichlet_prior{
     virtual void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) = 0;
 };
 
-class digamma_rec_opt: public dirichlet_prior_opt{
+class dirichlet_digamma_rec: public dirichlet_prior_opt{
     void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) override {
         // fill C_.(n) and C_k(n)
 
@@ -176,20 +176,45 @@ class digamma_rec_opt: public dirichlet_prior_opt{
         std::map<term_id, std::map<count_d, count_d>> terms_docs_counts;
         long doc_size, doc_term_freq;
 
+        cout << "Docs and terms:\n";
         for (auto d_id: dd.doc_ids){
             doc_size = dd.idx.doc_size(d_id);
 
             //// increase number of docs with the given size (C_.(n))
             docs_counts[doc_size] += 1;
 
-            for (auto t_id: dd.idx.terms(d_id)){
+            cout << d_id << " " << doc_size << " " << docs_counts[doc_size] << endl;
+            for (auto t_id: dd.term_ids){
                 doc_term_freq = dd.idx.term_freq(t_id, d_id);
 
                 //// increase number of docs with the given count of word t_id (C_k(n))
                 terms_docs_counts[t_id][doc_term_freq] += 1;
+
+                cout << "    " << t_id << " " << doc_term_freq << " " << terms_docs_counts[t_id][doc_term_freq] << endl;
             }
         }
 
+        cout << "\nDocuments_ids count: " << dd.doc_ids.size() << "; Terms ids count: " << dd.term_ids.size() << endl;
+
+        cout << "\nDocuments sizes frequency:\n";
+        for (auto kv: docs_counts){
+            cout << kv.first << " " << kv.second << endl;
+        }
+
+        int occur_sum, freq_sum;
+        cout << "\nTerms frequency in each doc:\n";
+        for (auto kv: terms_docs_counts){
+            occur_sum = 0;
+            freq_sum = 0;
+            cout << dd.idx.total_num_occurences(kv.first) << " " << kv.first << endl;
+            for (auto kv_: kv.second){
+                occur_sum += kv_.second;
+                freq_sum += kv_.first * kv_.second;
+                cout << "   " << kv_.first << " " << kv_.second << endl;
+            }
+            cout << "    " << freq_sum << " " << occur_sum << " total occurences" << endl;
+        }
+
 //        // sort by ascending of occurences
 //        std::sort(docs_counts.begin(), items.end());
 //        for (auto key: terms_docs_counts){
@@ -199,11 +224,11 @@ class digamma_rec_opt: public dirichlet_prior_opt{
         // p(w|REF) = dd.idx.total_num_occurences(t_id)
 
         // fill start vector alpha_m
-        double alpha = 1, alpha_mk_new;
+        double alpha = 2000.0, alpha_mk_new;
         std::map<term_id, double> alpha_m;
 
-        cout << "Start alpha: ";
-        for (auto t_id: dd.idx.terms()){
+        cout << "\nStart alpha: ";
+        for (auto t_id: dd.term_ids){
             alpha_m[t_id] = dd.idx.total_num_occurences(t_id) * alpha;
             alpha_m[t_id] /= (double)dd.ref_size;
             cout << alpha_m[t_id] << " ";
@@ -213,49 +238,55 @@ class digamma_rec_opt: public dirichlet_prior_opt{
         bool all_optimized = false;
         int iter_num = 0;
 
+        double n_max = docs_counts.rbegin()->first;
+        cout << "\n n_max=" << n_max << endl;
+
         while (!all_optimized && iter_num < max_iter){
-            D = 0;
-            S = 0;
+            D = 0.0;
+            S = 0.0;
             all_optimized = true;
 
             alpha = get_alpha(alpha_m);
 
             cout << "\nIter " << iter_num << " alpha = " << alpha;
-            count_d n, c_d;
-            for (auto kv: docs_counts){
-                n = kv.first;
-                c_d = kv.second;
+            count_d c_d;
+            for (count_d n = 1; n <= n_max; n++){
+                c_d = docs_counts[n];
 
-                D += 1/(n - 1 + alpha);
+                D += 1.0/(n - 1 + alpha);
                 S += c_d * D;
             }
 
-            std::map<count_d, count_d> c_n;
+            cout << "\nD = " << D << "; S = " << S << "; S_k = ";
+
+            std::map<count_d, count_d> c_k;
             term_id k;
             double S_k;
             for (auto kv: terms_docs_counts){
                 k = kv.first;
-                c_n = kv.second;
+                c_k = kv.second;
 
-                D = 0;
-                S_k = 0;
+                D = 0.0;
+                S_k = 0.0;
 
-                count_d n, c_k_n;
-                for (auto kv_: c_n){
-                    n = kv_.first;
-                    c_k_n = kv_.second;
+                count_d c_k_n, n_k_max = c_k.rbegin()->first;
+                cout << "\n n_k_max=" << n_k_max << endl;
+                for (count_d n = 1; n <= n_k_max; n++){
+                    c_k_n = c_k[n];
 
-                    D += 1/(n - 1 + alpha_m[k]);
+                    D += 1.0/(n - 1 + alpha_m[k]);
                     S_k += c_k_n * D;
                 }
 
+                cout << S_k << " ";
+
                 alpha_mk_new = alpha_m[k] * S_k / S;
 
                 if (std::abs(alpha_mk_new - alpha_m[k]) > eps){
                     all_optimized = false;
                 }
 
-                alpha_m[k] *= alpha_mk_new;
+                alpha_m[k] = alpha_mk_new;
             }
 
             cout << "\nVector alpha_m after the iter: ";
@@ -267,15 +298,15 @@ class digamma_rec_opt: public dirichlet_prior_opt{
         }
 
         mu_ = get_alpha(alpha_m);
-
+        cout << endl << mu_ << endl;
     }
 };
 
-class log_approx_opt: public dirichlet_prior_opt{
+class dirichlet_log_approx: public dirichlet_prior_opt{
 //    void optimize_mu(const inverted_index& idx) override { mu_ = 0;};
 };
 
-class mackay_peto_opt: public dirichlet_prior_opt{
+class dirichlet_mackay_peto: public dirichlet_prior_opt{
 //    void optimize_mu(const inverted_index& idx) override { mu_ = 0;};
 };
 
diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp
index ff2afc130..bd69cfba7 100644
--- a/src/index/ranker/test_opt/test.cpp
+++ b/src/index/ranker/test_opt/test.cpp
@@ -38,8 +38,8 @@ int main(int argc, char* argv[])
     auto time = common::time([&]()
     {
         // Create and make score of optimizer
-        index::digamma_rec_opt ranker;
-        std::cout << ranker.get_optimized_mu(*idx, 1e-6, 100) << std::endl;
+        index::dirichlet_digamma_rec ranker;
+        std::cout << "\n\n" << ranker.get_optimized_mu(*idx, 1e-6, 100) << std::endl;
     });
 
     std::cout << "Method DR took: " << time.count() / 1000.0

From 0a0851cc524926b83d68583068b176f4e463894b Mon Sep 17 00:00:00 2001
From: Alex2304 <alex2304el@gmail.com>
Date: Thu, 30 Nov 2017 18:44:29 +0300
Subject: [PATCH 20/30] [opt] method refactored

---
 include/meta/index/ranker/dirichlet_prior.h | 136 +++++++-------------
 src/index/ranker/test_opt/test.cpp          |   2 +-
 2 files changed, 46 insertions(+), 92 deletions(-)

diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index 1133ad941..ef04fc681 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -13,9 +13,6 @@
 #include "meta/index/ranker/ranker_factory.h"
 
 #include <cmath>
-#include <iostream>
-
-using namespace std;
 
 namespace meta
 {
@@ -92,6 +89,12 @@ struct docs_data
     std::vector<term_id> term_ids;
     /// total size of documents
     count_d ref_size;
+    /// C_.(n)
+    std::map<count_d, count_d> docs_counts;
+    /// C_k(n)
+    std::map<term_id, std::map<count_d, count_d>> terms_docs_counts;
+    /// vector alpha_m
+    std::map<term_id, double> alpha_m;
 
     /**
      * Constructor to initialize most elements.
@@ -99,11 +102,16 @@ struct docs_data
      * @param p_doc_ids ids of all docs
      * @param p_term_ids ids of all terms
      */
-    docs_data(const inverted_index& p_idx, std::vector<doc_id> p_doc_ids, std::vector<term_id> p_term_ids, count_d p_ref_size)
+    docs_data(const inverted_index& p_idx, std::vector<doc_id> p_doc_ids, std::vector<term_id> p_term_ids, count_d p_ref_size,
+              std::map<count_d, count_d> p_docs_counts, std::map<term_id, std::map<count_d, count_d>> p_terms_docs_counts,
+              std::map<term_id, double> p_alpha_m)
         : idx(p_idx), // gcc no non-const ref init from brace init list
           doc_ids{p_doc_ids},
           term_ids{p_term_ids},
-          ref_size{p_ref_size}
+          ref_size{p_ref_size},
+          docs_counts{p_docs_counts},
+          terms_docs_counts{p_terms_docs_counts},
+          alpha_m{p_alpha_m}
     {
         /* nothing */
     }
@@ -122,7 +130,7 @@ class dirichlet_prior_opt : public dirichlet_prior{
         return ranker::score(idx, begin, end, num_results);
     }
 
-    float get_optimized_mu(const inverted_index& idx, float eps=1e-6, int max_iter=100) {
+    float get_optimized_mu(const inverted_index& idx, float eps, int max_iter) {
         optimize_mu(idx, eps, max_iter);
 
         return mu_;
@@ -140,106 +148,64 @@ class dirichlet_prior_opt : public dirichlet_prior{
     }
 
 private:
-    void optimize_mu(const inverted_index& idx, float eps=1e-6, int max_iter=100) {
+    void optimize_mu(const inverted_index& idx, float eps=1e-6, int max_iter=10000) {
+        // parse idx and extract what we need
         auto docs_ids = idx.docs();
         auto terms_ids = idx.terms();
 
+        // calculate ref_size
         count_d ref_size = 0;
         for (auto& id : docs_ids)
             ref_size += idx.doc_size(id);
 
-        docs_data dd{idx, docs_ids, terms_ids, ref_size};
-
-        optimize_mu(dd, eps, max_iter);
-//        std::cout << idx.unique_terms() << std::endl;
-
-//        for (auto d_id: docs_ids){
-//            for (auto t_id: terms_ids){
-//                std::cout << idx.term_freq(t_id, d_id) << std::endl;
-//            }
-//        }
-
-//        optimize_mu(std::vector<doc_id> docs_ids,
-//        idx.unique_terms()
-//        idx.total_corpus_terms()
-
-    }
-
-    virtual void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) = 0;
-};
-
-class dirichlet_digamma_rec: public dirichlet_prior_opt{
-    void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) override {
-        // fill C_.(n) and C_k(n)
-
+        // calculate C_.(n) and C_k(n)
         std::map<count_d, count_d> docs_counts;
         std::map<term_id, std::map<count_d, count_d>> terms_docs_counts;
-        long doc_size, doc_term_freq;
 
-        cout << "Docs and terms:\n";
-        for (auto d_id: dd.doc_ids){
-            doc_size = dd.idx.doc_size(d_id);
+        long doc_size, doc_term_freq;
+        for (auto d_id: docs_ids){
+            doc_size = idx.doc_size(d_id);
 
             //// increase number of docs with the given size (C_.(n))
             docs_counts[doc_size] += 1;
 
-            cout << d_id << " " << doc_size << " " << docs_counts[doc_size] << endl;
-            for (auto t_id: dd.term_ids){
-                doc_term_freq = dd.idx.term_freq(t_id, d_id);
+            for (auto t_id: terms_ids){
+                doc_term_freq = idx.term_freq(t_id, d_id);
 
                 //// increase number of docs with the given count of word t_id (C_k(n))
                 terms_docs_counts[t_id][doc_term_freq] += 1;
-
-                cout << "    " << t_id << " " << doc_term_freq << " " << terms_docs_counts[t_id][doc_term_freq] << endl;
             }
         }
 
-        cout << "\nDocuments_ids count: " << dd.doc_ids.size() << "; Terms ids count: " << dd.term_ids.size() << endl;
-
-        cout << "\nDocuments sizes frequency:\n";
-        for (auto kv: docs_counts){
-            cout << kv.first << " " << kv.second << endl;
-        }
+        // fill start vector alpha_m
+        std::map<term_id, double> alpha_m;
 
-        int occur_sum, freq_sum;
-        cout << "\nTerms frequency in each doc:\n";
-        for (auto kv: terms_docs_counts){
-            occur_sum = 0;
-            freq_sum = 0;
-            cout << dd.idx.total_num_occurences(kv.first) << " " << kv.first << endl;
-            for (auto kv_: kv.second){
-                occur_sum += kv_.second;
-                freq_sum += kv_.first * kv_.second;
-                cout << "   " << kv_.first << " " << kv_.second << endl;
-            }
-            cout << "    " << freq_sum << " " << occur_sum << " total occurences" << endl;
+        for (auto t_id: terms_ids){
+            alpha_m[t_id] = idx.total_num_occurences(t_id) * default_mu;
+            alpha_m[t_id] /= (double)ref_size;
         }
 
-//        // sort by ascending of occurences
-//        std::sort(docs_counts.begin(), items.end());
-//        for (auto key: terms_docs_counts){
-//            std::sort(key.second.begin(), key.second.end());
-//        }
+        // create docs_data
+        docs_data dd{idx, docs_ids, terms_ids, ref_size, docs_counts, terms_docs_counts, alpha_m};
 
-        // p(w|REF) = dd.idx.total_num_occurences(t_id)
+        // call optimizer
+        optimize_mu(dd, eps, max_iter);
+    }
 
-        // fill start vector alpha_m
-        double alpha = 2000.0, alpha_mk_new;
-        std::map<term_id, double> alpha_m;
+    virtual void optimize_mu(docs_data& dd, float eps, int max_iter) = 0;
+};
 
-        cout << "\nStart alpha: ";
-        for (auto t_id: dd.term_ids){
-            alpha_m[t_id] = dd.idx.total_num_occurences(t_id) * alpha;
-            alpha_m[t_id] /= (double)dd.ref_size;
-            cout << alpha_m[t_id] << " ";
-        }
+class dirichlet_digamma_rec: public dirichlet_prior_opt{
+    void optimize_mu(docs_data& dd, float eps, int max_iter) override {
 
-        double D, S;
         bool all_optimized = false;
         int iter_num = 0;
+        double D, S;
+        double n_max = dd.docs_counts.rbegin()->first;
 
-        double n_max = docs_counts.rbegin()->first;
-        cout << "\n n_max=" << n_max << endl;
+        // start values for alpha and alpha_m
+        double alpha = default_mu, alpha_mk_new;
+        std::map<term_id, double> alpha_m = dd.alpha_m;
 
         while (!all_optimized && iter_num < max_iter){
             D = 0.0;
@@ -248,21 +214,18 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{
 
             alpha = get_alpha(alpha_m);
 
-            cout << "\nIter " << iter_num << " alpha = " << alpha;
             count_d c_d;
             for (count_d n = 1; n <= n_max; n++){
-                c_d = docs_counts[n];
+                c_d = dd.docs_counts[n];
 
                 D += 1.0/(n - 1 + alpha);
                 S += c_d * D;
             }
 
-            cout << "\nD = " << D << "; S = " << S << "; S_k = ";
-
-            std::map<count_d, count_d> c_k;
             term_id k;
+            std::map<count_d, count_d> c_k;
             double S_k;
-            for (auto kv: terms_docs_counts){
+            for (auto kv: dd.terms_docs_counts){
                 k = kv.first;
                 c_k = kv.second;
 
@@ -270,7 +233,6 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{
                 S_k = 0.0;
 
                 count_d c_k_n, n_k_max = c_k.rbegin()->first;
-                cout << "\n n_k_max=" << n_k_max << endl;
                 for (count_d n = 1; n <= n_k_max; n++){
                     c_k_n = c_k[n];
 
@@ -278,8 +240,6 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{
                     S_k += c_k_n * D;
                 }
 
-                cout << S_k << " ";
-
                 alpha_mk_new = alpha_m[k] * S_k / S;
 
                 if (std::abs(alpha_mk_new - alpha_m[k]) > eps){
@@ -289,16 +249,10 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{
                 alpha_m[k] = alpha_mk_new;
             }
 
-            cout << "\nVector alpha_m after the iter: ";
-            for (auto kv: alpha_m){
-                cout << " " << kv.second;
-            }
-
             iter_num++;
         }
 
         mu_ = get_alpha(alpha_m);
-        cout << endl << mu_ << endl;
     }
 };
 
diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp
index bd69cfba7..39f80c198 100644
--- a/src/index/ranker/test_opt/test.cpp
+++ b/src/index/ranker/test_opt/test.cpp
@@ -39,7 +39,7 @@ int main(int argc, char* argv[])
     {
         // Create and make score of optimizer
         index::dirichlet_digamma_rec ranker;
-        std::cout << "\n\n" << ranker.get_optimized_mu(*idx, 1e-6, 100) << std::endl;
+        std::cout << "\n\n" << ranker.get_optimized_mu(*idx, 1e-6, 10000) << std::endl;
     });
 
     std::cout << "Method DR took: " << time.count() / 1000.0

From d726f708b16985a47a2b2e5248f9f77dcd464a30 Mon Sep 17 00:00:00 2001
From: Alex2304 <alex2304el@gmail.com>
Date: Thu, 30 Nov 2017 19:05:46 +0300
Subject: [PATCH 21/30] [opt] + method2

---
 include/meta/index/ranker/dirichlet_prior.h | 60 ++++++++++++++++++++-
 src/index/ranker/test_opt/test.cpp          | 29 +++++-----
 2 files changed, 74 insertions(+), 15 deletions(-)

diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index ef04fc681..6b99671d8 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -197,7 +197,6 @@ class dirichlet_prior_opt : public dirichlet_prior{
 
 class dirichlet_digamma_rec: public dirichlet_prior_opt{
     void optimize_mu(docs_data& dd, float eps, int max_iter) override {
-
         bool all_optimized = false;
         int iter_num = 0;
         double D, S;
@@ -257,7 +256,64 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{
 };
 
 class dirichlet_log_approx: public dirichlet_prior_opt{
-//    void optimize_mu(const inverted_index& idx) override { mu_ = 0;};
+    void optimize_mu(docs_data& dd, float eps, int max_iter) override {
+        bool all_optimized = false;
+        int iter_num = 0;
+        double S, S_k;
+        double n_max = dd.docs_counts.rbegin()->first;
+
+        // start values for alpha and alpha_m
+        double alpha = default_mu, alpha_mk_new;
+        std::map<term_id, double> alpha_m = dd.alpha_m;
+
+        while (!all_optimized && iter_num < max_iter){
+            S = 0.0;
+            all_optimized = true;
+
+            alpha = get_alpha(alpha_m);
+
+            count_d c_d;
+            // TODO: skip the zero docs counts
+            for (count_d n = 1; n <= n_max; n++){
+                c_d = dd.docs_counts[n];
+
+                if (c_d != 0){
+                    S += c_d * (1.0/alpha + log(n + alpha - 0.5) - log(alpha + 0.5));
+                }
+            }
+
+            term_id k;
+            std::map<count_d, count_d> c_k;
+            for (auto kv: dd.terms_docs_counts){
+                k = kv.first;
+                c_k = kv.second;
+
+                S_k = 0.0;
+
+                count_d c_k_n, n_k_max = c_k.rbegin()->first;
+                // TODO: skip the zero docs counts
+                for (count_d n = 1; n <= n_k_max; n++){
+                    c_k_n = c_k[n];
+
+                    if (c_k_n != 0){
+                        S_k += c_k_n * (1.0/alpha_m[k] + log(n + alpha_m[k] - 0.5) - log(alpha_m[k] + 0.5));
+                    }
+                }
+
+                alpha_mk_new = alpha_m[k] * S_k / S;
+
+                if (std::abs(alpha_mk_new - alpha_m[k]) > eps){
+                    all_optimized = false;
+                }
+
+                alpha_m[k] = alpha_mk_new;
+            }
+
+            iter_num++;
+        }
+
+        mu_ = get_alpha(alpha_m);
+    }
 };
 
 class dirichlet_mackay_peto: public dirichlet_prior_opt{
diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp
index 39f80c198..5c5beb624 100644
--- a/src/index/ranker/test_opt/test.cpp
+++ b/src/index/ranker/test_opt/test.cpp
@@ -33,27 +33,30 @@ int main(int argc, char* argv[])
     auto config = cpptoml::parse_file(argv[1]);
     auto idx = index::make_index<index::inverted_index>(*config);
 
+    double eps = 1e-6;
+    int iters = 10000;
+
     // Time how long it takes to create the index. By default, common::time's
     //  unit of measurement is milliseconds.
     auto time = common::time([&]()
     {
         // Create and make score of optimizer
         index::dirichlet_digamma_rec ranker;
-        std::cout << "\n\n" << ranker.get_optimized_mu(*idx, 1e-6, 10000) << std::endl;
+        std::cout << "\n\n" << ranker.get_optimized_mu(*idx, eps, iters) << std::endl;
     });
 
-    std::cout << "Method DR took: " << time.count() / 1000.0
-              << " seconds" << std::endl;
+    std::cout << "Method DR took: " << time.count() / 1.0
+              << " milliseconds" << std::endl;
 
-//    time = common::time([&]()
-//    {
-//        // Create and make score of optimizer
-//        index::log_approx ranker;
-//        std::cout << ranker.get_optimized_mu(*idx) << std::endl;
-//    });
+    time = common::time([&]()
+    {
+        // Create and make score of optimizer
+        index::dirichlet_log_approx ranker;
+        std::cout << ranker.get_optimized_mu(*idx, eps, iters) << std::endl;
+    });
 
-//    std::cout << "Method LA took: " << time.count() / 1000.0
-//              << " seconds" << std::endl;
+    std::cout << "Method LA took: " << time.count() / 1.0
+              << " milliseconds" << std::endl;
 
 //    time = common::time([&]()
 //    {
@@ -62,8 +65,8 @@ int main(int argc, char* argv[])
 //        std::cout << ranker.get_optimized_mu(*idx) << std::endl;
 //    });
 
-    std::cout << "Method MP took: " << time.count() / 1000.0
-              << " seconds" << std::endl;
+    std::cout << "Method MP took: " << time.count() / 1.0
+              << " milliseconds" << std::endl;
 
     return 0;
 }

From 4a6a240e55063abc895453d5c67a30f0b430184a Mon Sep 17 00:00:00 2001
From: M <makkolts@gmail.com>
Date: Thu, 30 Nov 2017 19:10:48 +0300
Subject: [PATCH 22/30] Adding constructors and register for new ranker classes

---
 include/meta/index/ranker/dirichlet_prior.h | 57 ++++++++++++-
 src/index/ranker/dirichlet_prior.cpp        | 90 +++++++++++++++++++++
 2 files changed, 145 insertions(+), 2 deletions(-)

diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index 1133ad941..57e14841a 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -111,6 +111,11 @@ struct docs_data
 
 class dirichlet_prior_opt : public dirichlet_prior{
 public:
+
+    dirichlet_prior_opt(float mu) : dirichlet_prior(mu) { }
+
+    dirichlet_prior_opt(std::istream& in) : dirichlet_prior(in) { }
+
     template <class ForwardIterator>
     std::vector<search_result> score(inverted_index& idx, ForwardIterator begin,
                                      ForwardIterator end,
@@ -169,6 +174,22 @@ class dirichlet_prior_opt : public dirichlet_prior{
 };
 
 class dirichlet_digamma_rec: public dirichlet_prior_opt{
+public:
+    const static util::string_view id;
+
+    /**
+     * @param mu
+     */
+    dirichlet_digamma_rec(float mu = default_mu);
+
+    /**
+     * Loads a dirichlet_prior ranker from a stream.
+     * @param in The stream to read from
+     */
+    dirichlet_digamma_rec(std::istream& in);
+
+    void save(std::ostream& out) const override;
+private:
     void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) override {
         // fill C_.(n) and C_k(n)
 
@@ -303,11 +324,43 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{
 };
 
 class dirichlet_log_approx: public dirichlet_prior_opt{
-//    void optimize_mu(const inverted_index& idx) override { mu_ = 0;};
+public:
+    const static util::string_view id;
+
+    /**
+     * @param mu
+     */
+    dirichlet_log_approx(float mu = default_mu);
+
+    /**
+     * Loads a dirichlet_prior ranker from a stream.
+     * @param in The stream to read from
+     */
+    dirichlet_log_approx(std::istream& in);
+
+    void save(std::ostream& out) const override;
+private:
+    void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) override { mu_ = 0;};
 };
 
 class dirichlet_mackay_peto: public dirichlet_prior_opt{
-//    void optimize_mu(const inverted_index& idx) override { mu_ = 0;};
+public:
+    const static util::string_view id;
+
+    /**
+     * @param mu
+     */
+    dirichlet_mackay_peto(float mu = default_mu);
+
+    /**
+     * Loads a dirichlet_prior ranker from a stream.
+     * @param in The stream to read from
+     */
+    dirichlet_mackay_peto(std::istream& in);
+
+    void save(std::ostream& out) const override;
+private:
+    void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) override { mu_ = 0;};
 };
 
 /**
diff --git a/src/index/ranker/dirichlet_prior.cpp b/src/index/ranker/dirichlet_prior.cpp
index 07536afbe..9cf0fed10 100644
--- a/src/index/ranker/dirichlet_prior.cpp
+++ b/src/index/ranker/dirichlet_prior.cpp
@@ -55,5 +55,95 @@ std::unique_ptr<ranker>
         throw ranker_exception{"dirichlet-prior mu must be >= 0"};
     return make_unique<dirichlet_prior>(mu);
 }
+
+const util::string_view dirichlet_digamma_rec::id = "dirichlet-digamma-rec";
+template <>
+std::unique_ptr<ranker>
+    make_ranker<dirichlet_digamma_rec>(const cpptoml::table& config)
+{
+    auto mu = config.get_as<double>("mu").value_or(dirichlet_digamma_rec::default_mu);
+    if (mu < 0)
+        throw ranker_exception{"dirichlet-digamma-rec mu must be >= 0"};
+    return make_unique<dirichlet_digamma_rec>(mu);
+}
+
+dirichlet_digamma_rec::dirichlet_digamma_rec(float mu) : dirichlet_prior_opt(mu)
+{
+    // nothing
+}
+
+dirichlet_digamma_rec::dirichlet_digamma_rec(std::istream& in)
+    : dirichlet_prior_opt(in)
+{
+    // nothing
+}
+
+void dirichlet_digamma_rec::save(std::ostream& out) const
+{
+    io::packed::write(out, id);
+
+    io::packed::write(out, mu_);
+}
+
+const util::string_view dirichlet_log_approx::id = "dirichlet-log-approx";
+template <>
+std::unique_ptr<ranker>
+    make_ranker<dirichlet_log_approx>(const cpptoml::table& config)
+{
+    auto mu = config.get_as<double>("mu").value_or(dirichlet_log_approx::default_mu);
+    if (mu < 0)
+        throw ranker_exception{"dirichlet-log-approx mu must be >= 0"};
+    return make_unique<dirichlet_log_approx>(mu);
+}
+
+
+dirichlet_log_approx::dirichlet_log_approx(float mu) : dirichlet_prior_opt(mu)
+{
+    // nothing
+}
+
+dirichlet_log_approx::dirichlet_log_approx(std::istream& in)
+    : dirichlet_prior_opt(in)
+{
+    // nothing
+}
+
+void dirichlet_log_approx::save(std::ostream& out) const
+{
+    io::packed::write(out, id);
+
+    io::packed::write(out, mu_);
+}
+
+const util::string_view dirichlet_mackay_peto::id = "dirichlet-mackay-peto";
+template <>
+std::unique_ptr<ranker>
+    make_ranker<dirichlet_mackay_peto>(const cpptoml::table& config)
+{
+    auto mu = config.get_as<double>("mu").value_or(dirichlet_mackay_peto::default_mu);
+    if (mu < 0)
+        throw ranker_exception{"dirichlet-mackay-peto mu must be >= 0"};
+    return make_unique<dirichlet_mackay_peto>(mu);
+}
+
+
+dirichlet_mackay_peto::dirichlet_mackay_peto(float mu) : dirichlet_prior_opt(mu)
+{
+    // nothing
+}
+
+dirichlet_mackay_peto::dirichlet_mackay_peto(std::istream& in)
+    : dirichlet_prior_opt(in)
+{
+    // nothing
+}
+
+void dirichlet_mackay_peto::save(std::ostream& out) const
+{
+    io::packed::write(out, id);
+
+    io::packed::write(out, mu_);
+}
+
 }
 }

From bc948ce7133d9a06738e7699b342506d2119e4c3 Mon Sep 17 00:00:00 2001
From: M <makkolts@gmail.com>
Date: Thu, 30 Nov 2017 20:51:37 +0300
Subject: [PATCH 23/30] Add rankers to factory

---
 include/meta/index/ranker/dirichlet_prior.h | 21 +++++++++++++++++++++
 src/index/ranker/ranker_factory.cpp         |  3 +++
 2 files changed, 24 insertions(+)

diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index bd6dac611..2f0e33901 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -384,6 +384,27 @@ class dirichlet_mackay_peto: public dirichlet_prior_opt{
  */
 template <>
 std::unique_ptr<ranker> make_ranker<dirichlet_prior>(const cpptoml::table&);
+
+/**
+ * Specialization of the factory method used to create dirichlet_digamma_rec
+ * rankers.
+ */
+template <>
+std::unique_ptr<ranker> make_ranker<dirichlet_digamma_rec>(const cpptoml::table&);
+
+/**
+ * Specialization of the factory method used to create dirichlet_log_approx
+ * rankers.
+ */
+template <>
+std::unique_ptr<ranker> make_ranker<dirichlet_log_approx>(const cpptoml::table&);
+
+/**
+ * Specialization of the factory method used to create dirichlet_mackay_peto
+ * rankers.
+ */
+template <>
+std::unique_ptr<ranker> make_ranker<dirichlet_mackay_peto>(const cpptoml::table&);
 }
 }
 #endif
diff --git a/src/index/ranker/ranker_factory.cpp b/src/index/ranker/ranker_factory.cpp
index 86c1069af..0643b0742 100644
--- a/src/index/ranker/ranker_factory.cpp
+++ b/src/index/ranker/ranker_factory.cpp
@@ -31,6 +31,9 @@ ranker_factory::ranker_factory()
     reg<pivoted_length>();
     reg<kl_divergence_prf>();
     reg<rocchio>();
+    reg<dirichlet_digamma_rec>();
+    reg<dirichlet_log_approx>();
+    reg<dirichlet_mackay_peto>();
 }
 
 std::unique_ptr<ranker> make_ranker(const cpptoml::table& config)

From 78d6d5c8ae3f4170a152524438a5b5fad6ff99bf Mon Sep 17 00:00:00 2001
From: Alex2304 <alex2304el@gmail.com>
Date: Thu, 30 Nov 2017 20:58:58 +0300
Subject: [PATCH 24/30] [opt] + benchmark

---
 include/meta/index/ranker/dirichlet_prior.h | 30 ++++++++++------
 src/index/ranker/test_opt/test.cpp          | 39 ++++++++++++---------
 2 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index bd6dac611..8fdd52672 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -68,6 +68,10 @@ class dirichlet_prior : public language_model_ranker
      */
     float doc_constant(const score_data& sd) const override;
 
+    float parameter() const {
+        return mu_;
+    }
+
   protected:
     /// the Dirichlet prior parameter
 //    const float mu_;
@@ -135,10 +139,8 @@ class dirichlet_prior_opt : public dirichlet_prior{
         return ranker::score(idx, begin, end, num_results);
     }
 
-    float get_optimized_mu(const inverted_index& idx, float eps, int max_iter) {
-        optimize_mu(idx, eps, max_iter);
-
-        return mu_;
+    std::map<term_id, double> get_optimized_mu(const inverted_index& idx, float eps, int max_iter) {
+        return optimize_mu(idx, eps, max_iter);
     }
 
 protected:
@@ -153,7 +155,7 @@ class dirichlet_prior_opt : public dirichlet_prior{
     }
 
 private:
-    void optimize_mu(const inverted_index& idx, float eps=1e-6, int max_iter=10000) {
+    std::map<term_id, double> optimize_mu(const inverted_index& idx, float eps=1e-6, int max_iter=10000) {
         // parse idx and extract what we need
         auto docs_ids = idx.docs();
         auto terms_ids = idx.terms();
@@ -194,10 +196,10 @@ class dirichlet_prior_opt : public dirichlet_prior{
         docs_data dd{idx, docs_ids, terms_ids, ref_size, docs_counts, terms_docs_counts, alpha_m};
 
         // call optimizer
-        optimize_mu(dd, eps, max_iter);
+        return optimize_mu(dd, eps, max_iter);
     }
 
-    virtual void optimize_mu(docs_data& dd, float eps, int max_iter) = 0;
+    virtual std::map<term_id, double> optimize_mu(docs_data& dd, float eps, int max_iter) = 0;
 };
 
 class dirichlet_digamma_rec: public dirichlet_prior_opt{
@@ -217,7 +219,7 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{
 
     void save(std::ostream& out) const override;
 private:
-    void optimize_mu(docs_data& dd, float eps, int max_iter) override {
+    std::map<term_id, double> optimize_mu(docs_data& dd, float eps, int max_iter) override {
         bool all_optimized = false;
         int iter_num = 0;
         double D, S;
@@ -273,7 +275,10 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{
         }
 
         mu_ = get_alpha(alpha_m);
+
+        return alpha_m;
     }
+
 };
 
 class dirichlet_log_approx: public dirichlet_prior_opt{
@@ -293,7 +298,7 @@ class dirichlet_log_approx: public dirichlet_prior_opt{
 
     void save(std::ostream& out) const override;
 private:
-    void optimize_mu(docs_data& dd, float eps, int max_iter) override {
+    std::map<term_id, double> optimize_mu(docs_data& dd, float eps, int max_iter) override {
         bool all_optimized = false;
         int iter_num = 0;
         double S, S_k;
@@ -350,6 +355,8 @@ class dirichlet_log_approx: public dirichlet_prior_opt{
         }
 
         mu_ = get_alpha(alpha_m);
+
+        return alpha_m;
     }
 };
 
@@ -370,11 +377,14 @@ class dirichlet_mackay_peto: public dirichlet_prior_opt{
 
     void save(std::ostream& out) const override;
 private:
-    void optimize_mu(docs_data& dd, float eps, int max_iter) override {
+    std::map<term_id, double> optimize_mu(docs_data& dd, float eps, int max_iter) override {
         eps = eps;
         max_iter = max_iter;
         eps = dd.ref_size;
         mu_ = 0;
+        std::map<term_id, double> alpha_m;
+
+        return alpha_m;
     }
 };
 
diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp
index 5c5beb624..054952104 100644
--- a/src/index/ranker/test_opt/test.cpp
+++ b/src/index/ranker/test_opt/test.cpp
@@ -13,6 +13,13 @@
 using namespace meta;
 
 
+void display_result(float alpha, std::map<term_id, double> alpha_m, float time){
+    for (auto kv: alpha_m){
+        std::cout << kv.second << " ";
+    }
+    std::cout << alpha << std::endl << time << std::endl;
+}
+
 int main(int argc, char* argv[])
 {
     if (argc != 2)
@@ -36,27 +43,28 @@ int main(int argc, char* argv[])
     double eps = 1e-6;
     int iters = 10000;
 
-    // Time how long it takes to create the index. By default, common::time's
-    //  unit of measurement is milliseconds.
-    auto time = common::time([&]()
+    float alpha;
+    std::map<term_id, double> alpha_m;
+
+    index::dirichlet_digamma_rec ranker1;
+    index::dirichlet_log_approx ranker2;
+
+    auto time1 = common::time([&]()
     {
-        // Create and make score of optimizer
-        index::dirichlet_digamma_rec ranker;
-        std::cout << "\n\n" << ranker.get_optimized_mu(*idx, eps, iters) << std::endl;
+        alpha_m = ranker1.get_optimized_mu(*idx, eps, iters);
+        alpha = ranker1.parameter();
     });
 
-    std::cout << "Method DR took: " << time.count() / 1.0
-              << " milliseconds" << std::endl;
+    display_result(alpha, alpha_m, time1.count() / 1.0);
 
-    time = common::time([&]()
+    auto time2 = common::time([&]()
     {
-        // Create and make score of optimizer
-        index::dirichlet_log_approx ranker;
-        std::cout << ranker.get_optimized_mu(*idx, eps, iters) << std::endl;
+        alpha_m = ranker2.get_optimized_mu(*idx, eps, iters);
+        alpha = ranker2.parameter();
     });
 
-    std::cout << "Method LA took: " << time.count() / 1.0
-              << " milliseconds" << std::endl;
+    display_result(alpha, alpha_m, time2.count() / 1.0);
+
 
 //    time = common::time([&]()
 //    {
@@ -65,8 +73,5 @@ int main(int argc, char* argv[])
 //        std::cout << ranker.get_optimized_mu(*idx) << std::endl;
 //    });
 
-    std::cout << "Method MP took: " << time.count() / 1.0
-              << " milliseconds" << std::endl;
-
     return 0;
 }

From 5bc6ee68e162ff4a4aae626a2fb3d9c5932b9c80 Mon Sep 17 00:00:00 2001
From: M <makkolts@gmail.com>
Date: Thu, 30 Nov 2017 21:37:49 +0300
Subject: [PATCH 25/30] Minor fix for output

---
 src/index/ranker/test_opt/test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp
index 054952104..2c27f2427 100644
--- a/src/index/ranker/test_opt/test.cpp
+++ b/src/index/ranker/test_opt/test.cpp
@@ -17,7 +17,7 @@ void display_result(float alpha, std::map<term_id, double> alpha_m, float time){
     for (auto kv: alpha_m){
         std::cout << kv.second << " ";
     }
-    std::cout << alpha << std::endl << time << std::endl;
+    std::cout << std::endl << alpha << std::endl << time << std::endl;
 }
 
 int main(int argc, char* argv[])

From 4f8fa1d59e3dc9f862a491a08b744ed9a789fd8b Mon Sep 17 00:00:00 2001
From: Alex2304 <alex2304el@gmail.com>
Date: Fri, 1 Dec 2017 02:21:48 +0300
Subject: [PATCH 26/30] [opt] + dirichlet_opt files

---
 include/meta/index/ranker/all.h             |   1 +
 include/meta/index/ranker/dirichlet_prior.h | 331 --------------------
 src/index/ranker/CMakeLists.txt             |   1 +
 src/index/ranker/dirichlet_prior.cpp        |  89 ------
 4 files changed, 2 insertions(+), 420 deletions(-)

diff --git a/include/meta/index/ranker/all.h b/include/meta/index/ranker/all.h
index 8a1fe0e04..3b3c1efcf 100644
--- a/include/meta/index/ranker/all.h
+++ b/include/meta/index/ranker/all.h
@@ -1,6 +1,7 @@
 #include "meta/index/ranker/ranker.h"
 #include "meta/index/ranker/absolute_discount.h"
 #include "meta/index/ranker/dirichlet_prior.h"
+#include "meta/index/ranker/dirichlet_prior_opt.h"
 #include "meta/index/ranker/jelinek_mercer.h"
 #include "meta/index/ranker/lm_ranker.h"
 #include "meta/index/ranker/okapi_bm25.h"
diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index cfe782b1a..6f2456f8a 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -12,8 +12,6 @@
 #include "meta/index/ranker/lm_ranker.h"
 #include "meta/index/ranker/ranker_factory.h"
 
-#include <cmath>
-
 namespace meta
 {
 namespace index
@@ -79,315 +77,6 @@ class dirichlet_prior : public language_model_ranker
 };
 
 
-// # TODO: choose template type instead of long
-typedef long count_d;
-
-struct docs_data
-{
-    // general info
-
-    const inverted_index& idx;
-    /// ids of all documents
-    std::vector<doc_id> doc_ids;
-    /// ids of all terms
-    std::vector<term_id> term_ids;
-    /// total size of documents
-    count_d ref_size;
-    /// C_.(n)
-    std::map<count_d, count_d> docs_counts;
-    /// C_k(n)
-    std::map<term_id, std::map<count_d, count_d>> terms_docs_counts;
-    /// vector alpha_m
-    std::map<term_id, double> alpha_m;
-
-    /**
-     * Constructor to initialize most elements.
-     * @param p_idx The index that is being used
-     * @param p_doc_ids ids of all docs
-     * @param p_term_ids ids of all terms
-     */
-    docs_data(const inverted_index& p_idx, std::vector<doc_id> p_doc_ids, std::vector<term_id> p_term_ids, count_d p_ref_size,
-              std::map<count_d, count_d> p_docs_counts, std::map<term_id, std::map<count_d, count_d>> p_terms_docs_counts,
-              std::map<term_id, double> p_alpha_m)
-        : idx(p_idx), // gcc no non-const ref init from brace init list
-          doc_ids{p_doc_ids},
-          term_ids{p_term_ids},
-          ref_size{p_ref_size},
-          docs_counts{p_docs_counts},
-          terms_docs_counts{p_terms_docs_counts},
-          alpha_m{p_alpha_m}
-    {
-        /* nothing */
-    }
-};
-
-class dirichlet_prior_opt : public dirichlet_prior{
-public:
-
-    dirichlet_prior_opt(float mu) : dirichlet_prior(mu) { }
-
-    dirichlet_prior_opt(std::istream& in) : dirichlet_prior(in) { }
-
-    template <class ForwardIterator>
-    std::vector<search_result> score(inverted_index& idx, ForwardIterator begin,
-                                     ForwardIterator end,
-                                     uint64_t num_results = 10)
-    {
-        // optimize mu according to ranker_context before ranking
-        this->optimize_mu(idx);
-
-        return ranker::score(idx, begin, end, num_results);
-    }
-
-    std::map<term_id, double> get_optimized_mu(const inverted_index& idx, float eps, int max_iter) {
-        return optimize_mu(idx, eps, max_iter);
-    }
-
-protected:
-    inline double get_alpha(std::map<term_id, double> alpha_m){
-        double alpha = 0;
-
-        for (auto alpha_m_k: alpha_m){
-            alpha += alpha_m_k.second;
-        }
-
-        return alpha;
-    }
-
-private:
-    std::map<term_id, double> optimize_mu(const inverted_index& idx, float eps=1e-6, int max_iter=10000) {
-        // parse idx and extract what we need
-        auto docs_ids = idx.docs();
-        auto terms_ids = idx.terms();
-
-        // calculate ref_size
-        count_d ref_size = 0;
-        for (auto& id : docs_ids)
-            ref_size += idx.doc_size(id);
-
-        // calculate C_.(n) and C_k(n)
-        std::map<count_d, count_d> docs_counts;
-        std::map<term_id, std::map<count_d, count_d>> terms_docs_counts;
-
-        long doc_size, doc_term_freq;
-        for (auto d_id: docs_ids){
-            doc_size = idx.doc_size(d_id);
-
-            //// increase number of docs with the given size (C_.(n))
-            docs_counts[doc_size] += 1;
-
-            for (auto t_id: terms_ids){
-                doc_term_freq = idx.term_freq(t_id, d_id);
-
-                //// increase number of docs with the given count of word t_id (C_k(n))
-                terms_docs_counts[t_id][doc_term_freq] += 1;
-            }
-        }
-
-        // fill start vector alpha_m
-        std::map<term_id, double> alpha_m;
-
-        for (auto t_id: terms_ids){
-            alpha_m[t_id] = idx.total_num_occurences(t_id) * default_mu;
-            alpha_m[t_id] /= (double)ref_size;
-        }
-
-        // create docs_data
-        docs_data dd{idx, docs_ids, terms_ids, ref_size, docs_counts, terms_docs_counts, alpha_m};
-
-        // call optimizer
-        return optimize_mu(dd, eps, max_iter);
-    }
-
-    virtual std::map<term_id, double> optimize_mu(docs_data& dd, float eps, int max_iter) = 0;
-};
-
-class dirichlet_digamma_rec: public dirichlet_prior_opt{
-public:
-    const static util::string_view id;
-
-    /**
-     * @param mu
-     */
-    dirichlet_digamma_rec(float mu = default_mu);
-
-    /**
-     * Loads a dirichlet_prior ranker from a stream.
-     * @param in The stream to read from
-     */
-    dirichlet_digamma_rec(std::istream& in);
-
-    void save(std::ostream& out) const override;
-private:
-    std::map<term_id, double> optimize_mu(docs_data& dd, float eps, int max_iter) override {
-        bool all_optimized = false;
-        int iter_num = 0;
-        double D, S;
-        double n_max = dd.docs_counts.rbegin()->first;
-
-        // start values for alpha and alpha_m
-        double alpha = default_mu, alpha_mk_new;
-        std::map<term_id, double> alpha_m = dd.alpha_m;
-
-        while (!all_optimized && iter_num < max_iter){
-            D = 0.0;
-            S = 0.0;
-            all_optimized = true;
-
-            alpha = get_alpha(alpha_m);
-
-            count_d c_d;
-            for (count_d n = 1; n <= n_max; n++){
-                c_d = dd.docs_counts[n];
-
-                D += 1.0/(n - 1 + alpha);
-                S += c_d * D;
-            }
-
-            term_id k;
-            std::map<count_d, count_d> c_k;
-            double S_k;
-            for (auto kv: dd.terms_docs_counts){
-                k = kv.first;
-                c_k = kv.second;
-
-                D = 0.0;
-                S_k = 0.0;
-
-                count_d c_k_n, n_k_max = c_k.rbegin()->first;
-                for (count_d n = 1; n <= n_k_max; n++){
-                    c_k_n = c_k[n];
-
-                    D += 1.0/(n - 1 + alpha_m[k]);
-                    S_k += c_k_n * D;
-                }
-
-                alpha_mk_new = alpha_m[k] * S_k / S;
-
-                if (std::abs(alpha_mk_new - alpha_m[k]) > eps){
-                    all_optimized = false;
-                }
-
-                alpha_m[k] = alpha_mk_new;
-            }
-
-            iter_num++;
-        }
-
-        mu_ = get_alpha(alpha_m);
-
-        return alpha_m;
-    }
-
-};
-
-class dirichlet_log_approx: public dirichlet_prior_opt{
-public:
-    const static util::string_view id;
-
-    /**
-     * @param mu
-     */
-    dirichlet_log_approx(float mu = default_mu);
-
-    /**
-     * Loads a dirichlet_prior ranker from a stream.
-     * @param in The stream to read from
-     */
-    dirichlet_log_approx(std::istream& in);
-
-    void save(std::ostream& out) const override;
-private:
-    std::map<term_id, double> optimize_mu(docs_data& dd, float eps, int max_iter) override {
-        bool all_optimized = false;
-        int iter_num = 0;
-        double S, S_k;
-        double n_max = dd.docs_counts.rbegin()->first;
-
-        // start values for alpha and alpha_m
-        double alpha = default_mu, alpha_mk_new;
-        std::map<term_id, double> alpha_m = dd.alpha_m;
-
-        while (!all_optimized && iter_num < max_iter){
-            S = 0.0;
-            all_optimized = true;
-
-            alpha = get_alpha(alpha_m);
-
-            count_d c_d;
-            // TODO: skip the zero docs counts
-            for (count_d n = 1; n <= n_max; n++){
-                c_d = dd.docs_counts[n];
-
-                if (c_d != 0){
-                    S += c_d * (1.0/alpha + log(n + alpha - 0.5) - log(alpha + 0.5));
-                }
-            }
-
-            term_id k;
-            std::map<count_d, count_d> c_k;
-            for (auto kv: dd.terms_docs_counts){
-                k = kv.first;
-                c_k = kv.second;
-
-                S_k = 0.0;
-
-                count_d c_k_n, n_k_max = c_k.rbegin()->first;
-                // TODO: skip the zero docs counts
-                for (count_d n = 1; n <= n_k_max; n++){
-                    c_k_n = c_k[n];
-
-                    if (c_k_n != 0){
-                        S_k += c_k_n * (1.0/alpha_m[k] + log(n + alpha_m[k] - 0.5) - log(alpha_m[k] + 0.5));
-                    }
-                }
-
-                alpha_mk_new = alpha_m[k] * S_k / S;
-
-                if (std::abs(alpha_mk_new - alpha_m[k]) > eps){
-                    all_optimized = false;
-                }
-
-                alpha_m[k] = alpha_mk_new;
-            }
-
-            iter_num++;
-        }
-
-        mu_ = get_alpha(alpha_m);
-
-        return alpha_m;
-    }
-};
-
-class dirichlet_mackay_peto: public dirichlet_prior_opt{
-public:
-    const static util::string_view id;
-
-    /**
-     * @param mu
-     */
-    dirichlet_mackay_peto(float mu = default_mu);
-
-    /**
-     * Loads a dirichlet_prior ranker from a stream.
-     * @param in The stream to read from
-     */
-    dirichlet_mackay_peto(std::istream& in);
-
-    void save(std::ostream& out) const override;
-private:
-    std::map<term_id, double> optimize_mu(docs_data& dd, float eps, int max_iter) override {
-        eps = eps;
-        max_iter = max_iter;
-        eps = dd.ref_size;
-        mu_ = 0;
-        std::map<term_id, double> alpha_m;
-
-        return alpha_m;
-    }
-};
-
 /**
  * Specialization of the factory method used to create dirichlet_prior
  * rankers.
@@ -395,26 +84,6 @@ class dirichlet_mackay_peto: public dirichlet_prior_opt{
 template <>
 std::unique_ptr<ranker> make_ranker<dirichlet_prior>(const cpptoml::table&);
 
-/**
- * Specialization of the factory method used to create dirichlet_digamma_rec
- * rankers.
- */
-template <>
-std::unique_ptr<ranker> make_ranker<dirichlet_digamma_rec>(const cpptoml::table&);
-
-/**
- * Specialization of the factory method used to create dirichlet_log_approx
- * rankers.
- */
-template <>
-std::unique_ptr<ranker> make_ranker<dirichlet_log_approx>(const cpptoml::table&);
-
-/**
- * Specialization of the factory method used to create dirichlet_mackay_peto
- * rankers.
- */
-template <>
-std::unique_ptr<ranker> make_ranker<dirichlet_mackay_peto>(const cpptoml::table&);
 }
 }
 #endif
diff --git a/src/index/ranker/CMakeLists.txt b/src/index/ranker/CMakeLists.txt
index e386d54b6..f0e699418 100644
--- a/src/index/ranker/CMakeLists.txt
+++ b/src/index/ranker/CMakeLists.txt
@@ -4,6 +4,7 @@ add_subdirectory(test_opt)
 
 add_library(meta-ranker absolute_discount.cpp
                         dirichlet_prior.cpp
+						dirichlet_prior_opt.cpp
                         jelinek_mercer.cpp
                         lm_ranker.cpp
                         okapi_bm25.cpp
diff --git a/src/index/ranker/dirichlet_prior.cpp b/src/index/ranker/dirichlet_prior.cpp
index 9cf0fed10..3a43297d6 100644
--- a/src/index/ranker/dirichlet_prior.cpp
+++ b/src/index/ranker/dirichlet_prior.cpp
@@ -56,94 +56,5 @@ std::unique_ptr<ranker>
     return make_unique<dirichlet_prior>(mu);
 }
 
-const util::string_view dirichlet_digamma_rec::id = "dirichlet-digamma-rec";
-template <>
-std::unique_ptr<ranker>
-    make_ranker<dirichlet_digamma_rec>(const cpptoml::table& config)
-{
-    auto mu = config.get_as<double>("mu").value_or(dirichlet_digamma_rec::default_mu);
-    if (mu < 0)
-        throw ranker_exception{"dirichlet-digamma-rec mu must be >= 0"};
-    return make_unique<dirichlet_digamma_rec>(mu);
-}
-
-dirichlet_digamma_rec::dirichlet_digamma_rec(float mu) : dirichlet_prior_opt(mu)
-{
-    // nothing
-}
-
-dirichlet_digamma_rec::dirichlet_digamma_rec(std::istream& in)
-    : dirichlet_prior_opt(in)
-{
-    // nothing
-}
-
-void dirichlet_digamma_rec::save(std::ostream& out) const
-{
-    io::packed::write(out, id);
-
-    io::packed::write(out, mu_);
-}
-
-const util::string_view dirichlet_log_approx::id = "dirichlet-log-approx";
-template <>
-std::unique_ptr<ranker>
-    make_ranker<dirichlet_log_approx>(const cpptoml::table& config)
-{
-    auto mu = config.get_as<double>("mu").value_or(dirichlet_log_approx::default_mu);
-    if (mu < 0)
-        throw ranker_exception{"dirichlet-log-approx mu must be >= 0"};
-    return make_unique<dirichlet_log_approx>(mu);
-}
-
-
-dirichlet_log_approx::dirichlet_log_approx(float mu) : dirichlet_prior_opt(mu)
-{
-    // nothing
-}
-
-dirichlet_log_approx::dirichlet_log_approx(std::istream& in)
-    : dirichlet_prior_opt(in)
-{
-    // nothing
-}
-
-void dirichlet_log_approx::save(std::ostream& out) const
-{
-    io::packed::write(out, id);
-
-    io::packed::write(out, mu_);
-}
-
-const util::string_view dirichlet_mackay_peto::id = "dirichlet-mackay-peto";
-template <>
-std::unique_ptr<ranker>
-    make_ranker<dirichlet_mackay_peto>(const cpptoml::table& config)
-{
-    auto mu = config.get_as<double>("mu").value_or(dirichlet_mackay_peto::default_mu);
-    if (mu < 0)
-        throw ranker_exception{"dirichlet-mackay-peto mu must be >= 0"};
-    return make_unique<dirichlet_mackay_peto>(mu);
-}
-
-
-dirichlet_mackay_peto::dirichlet_mackay_peto(float mu) : dirichlet_prior_opt(mu)
-{
-    // nothing
-}
-
-dirichlet_mackay_peto::dirichlet_mackay_peto(std::istream& in)
-    : dirichlet_prior_opt(in)
-{
-    // nothing
-}
-
-void dirichlet_mackay_peto::save(std::ostream& out) const
-{
-    io::packed::write(out, id);
-
-    io::packed::write(out, mu_);
-}
-
 }
 }

From c8ddfbfdd9eb620e35b2e21d665289b33d19b828 Mon Sep 17 00:00:00 2001
From: Alex2304 <alex2304el@gmail.com>
Date: Fri, 1 Dec 2017 02:24:45 +0300
Subject: [PATCH 27/30] [opt] + dirichlet_prior_opt

---
 .../meta/index/ranker/dirichlet_prior_opt.h   | 241 +++++++++++++++++
 src/index/ranker/dirichlet_prior_opt.cpp      | 242 ++++++++++++++++++
 2 files changed, 483 insertions(+)
 create mode 100644 include/meta/index/ranker/dirichlet_prior_opt.h
 create mode 100644 src/index/ranker/dirichlet_prior_opt.cpp

diff --git a/include/meta/index/ranker/dirichlet_prior_opt.h b/include/meta/index/ranker/dirichlet_prior_opt.h
new file mode 100644
index 000000000..face18029
--- /dev/null
+++ b/include/meta/index/ranker/dirichlet_prior_opt.h
@@ -0,0 +1,241 @@
+/**
+ * @file dirichlet_prior_opt.h
+ * @author Aleksey Marashov, Kolomiets Maxim
+ *
+ * All files in META are released under the MIT license. For more details,
+ * consult the file LICENSE in the root of the project.
+ */
+
+#ifndef META_DIRICHLET_PRIOR_OPT_H_
+#define META_DIRICHLET_PRIOR_OPT_H_
+
+#include "meta/index/ranker/dirichlet_prior.h"
+
+#include <cmath>
+
+namespace meta
+{
+namespace index
+{
+
+// # TODO: choose template type instead of long
+typedef long count_d;
+
+struct docs_data
+{
+    // general info
+
+    const inverted_index& idx;
+    /// ids of all documents
+    std::vector<doc_id> doc_ids;
+    /// ids of all terms
+    std::vector<term_id> term_ids;
+    /// total size of documents
+    count_d ref_size;
+    /// C_.(n)
+    std::map<count_d, count_d> docs_counts;
+    /// C_k(n)
+    std::map<term_id, std::map<count_d, count_d>> terms_docs_counts;
+    /// vector alpha_m
+    std::map<term_id, double> alpha_m;
+
+    /**
+     * Constructor to initialize most elements.
+     * @param p_idx The index that is being used
+     * @param p_doc_ids ids of all docs
+     * @param p_term_ids ids of all terms
+     */
+    docs_data(const inverted_index& p_idx, std::vector<doc_id> p_doc_ids, std::vector<term_id> p_term_ids, count_d p_ref_size,
+              std::map<count_d, count_d> p_docs_counts, std::map<term_id, std::map<count_d, count_d>> p_terms_docs_counts,
+              std::map<term_id, double> p_alpha_m)
+        : idx(p_idx), // gcc no non-const ref init from brace init list
+          doc_ids{p_doc_ids},
+          term_ids{p_term_ids},
+          ref_size{p_ref_size},
+          docs_counts{p_docs_counts},
+          terms_docs_counts{p_terms_docs_counts},
+          alpha_m{p_alpha_m}
+    {
+        /* nothing */
+    }
+};
+
+
+/**
+ * Implements Bayesian smoothing with a Dirichlet prior.
+ *
+ * Required config parameters:
+ * ~~~toml
+ * [ranker]
+ * method = "dirichlet-prior"
+ * ~~~
+ *
+ * Optional config parameters:
+ * ~~~toml
+ * mu = 2000.0
+ * ~~~
+ */
+class dirichlet_prior_opt : public dirichlet_prior{
+public:
+    dirichlet_prior_opt(float mu) : dirichlet_prior(mu) { }
+
+    dirichlet_prior_opt(std::istream& in) : dirichlet_prior(in) { }
+
+    template <class ForwardIterator>
+    std::vector<search_result> score(inverted_index& idx, ForwardIterator begin,
+                                     ForwardIterator end,
+                                     uint64_t num_results = 10)
+    {
+        this->optimize_mu(idx);
+
+        return ranker::score(idx, begin, end, num_results);
+    }
+
+    std::map<term_id, double> get_optimized_mu(const inverted_index& idx, float eps, int max_iter) {
+        return optimize_mu(idx, eps, max_iter);
+    }
+
+protected:
+    inline double get_alpha(std::map<term_id, double> alpha_m){
+        double alpha = 0;
+
+        for (auto alpha_m_k: alpha_m){
+            alpha += alpha_m_k.second;
+        }
+
+        return alpha;
+    }
+
+private:
+    std::map<term_id, double> optimize_mu(const inverted_index& idx, float eps=1e-6, int max_iter=10000) {
+        // parse idx and extract what we need
+
+        auto docs_ids = idx.docs();
+        auto terms_ids = idx.terms();
+
+        // calculate ref_size
+        count_d ref_size = 0;
+        for (auto& id : docs_ids)
+            ref_size += idx.doc_size(id);
+
+        // calculate C_.(n) and C_k(n)
+        std::map<count_d, count_d> docs_counts;
+        std::map<term_id, std::map<count_d, count_d>> terms_docs_counts;
+
+        long doc_size, doc_term_freq;
+        for (auto d_id: docs_ids){
+            doc_size = idx.doc_size(d_id);
+
+            //// increase number of docs with the given size (C_.(n))
+            docs_counts[doc_size] += 1;
+
+            for (auto t_id: terms_ids){
+                doc_term_freq = idx.term_freq(t_id, d_id);
+
+                //// increase number of docs with the given count of word t_id (C_k(n))
+                terms_docs_counts[t_id][doc_term_freq] += 1;
+            }
+        }
+
+        // fill start vector alpha_m
+        std::map<term_id, double> alpha_m;
+
+        for (auto t_id: terms_ids){
+            alpha_m[t_id] = idx.total_num_occurences(t_id) * default_mu;
+            alpha_m[t_id] /= (double)ref_size;
+        }
+
+        // create docs_data
+        docs_data dd{idx, docs_ids, terms_ids, ref_size, docs_counts, terms_docs_counts, alpha_m};
+
+        // call optimizer
+        return optimize_mu(dd, eps, max_iter);
+    }
+
+    virtual std::map<term_id, double> optimize_mu(docs_data& dd, float eps, int max_iter) = 0;
+};
+
+class dirichlet_digamma_rec: public dirichlet_prior_opt{
+public:
+    const static util::string_view id;
+
+    /**
+     * @param mu
+     */
+    dirichlet_digamma_rec(float mu = default_mu);
+
+    /**
+     * Loads a dirichlet_prior ranker from a stream.
+     * @param in The stream to read from
+     */
+    dirichlet_digamma_rec(std::istream& in);
+
+    void save(std::ostream& out) const override;
+private:
+    std::map<term_id, double> optimize_mu(docs_data& dd, float eps, int max_iter) override;
+
+};
+
+class dirichlet_log_approx: public dirichlet_prior_opt{
+public:
+    const static util::string_view id;
+
+    /**
+     * @param mu
+     */
+    dirichlet_log_approx(float mu = default_mu);
+
+    /**
+     * Loads a dirichlet_prior ranker from a stream.
+     * @param in The stream to read from
+     */
+    dirichlet_log_approx(std::istream& in);
+
+    void save(std::ostream& out) const override;
+private:
+    std::map<term_id, double> optimize_mu(docs_data& dd, float eps, int max_iter) override;
+};
+
+class dirichlet_mackay_peto: public dirichlet_prior_opt{
+public:
+    const static util::string_view id;
+
+    /**
+     * @param mu
+     */
+    dirichlet_mackay_peto(float mu = default_mu);
+
+    /**
+     * Loads a dirichlet_prior ranker from a stream.
+     * @param in The stream to read from
+     */
+    dirichlet_mackay_peto(std::istream& in);
+
+    void save(std::ostream& out) const override;
+private:
+    std::map<term_id, double> optimize_mu(docs_data& dd, float eps, int max_iter) override;
+};
+
+/**
+ * Specialization of the factory method used to create dirichlet_digamma_rec
+ * rankers.
+ */
+template <>
+std::unique_ptr<ranker> make_ranker<dirichlet_digamma_rec>(const cpptoml::table&);
+
+/**
+ * Specialization of the factory method used to create dirichlet_log_approx
+ * rankers.
+ */
+template <>
+std::unique_ptr<ranker> make_ranker<dirichlet_log_approx>(const cpptoml::table&);
+
+/**
+ * Specialization of the factory method used to create dirichlet_mackay_peto
+ * rankers.
+ */
+template <>
+std::unique_ptr<ranker> make_ranker<dirichlet_mackay_peto>(const cpptoml::table&);
+}
+}
+#endif
diff --git a/src/index/ranker/dirichlet_prior_opt.cpp b/src/index/ranker/dirichlet_prior_opt.cpp
new file mode 100644
index 000000000..2a2df1ce5
--- /dev/null
+++ b/src/index/ranker/dirichlet_prior_opt.cpp
@@ -0,0 +1,242 @@
+/**
+ * @file dirichlet_prior_opt.cpp
+ * @author Aleksey Marashov, Kolomiets Maksim
+ */
+
+#include "cpptoml.h"
+#include "meta/index/ranker/dirichlet_prior_opt.h"
+#include "meta/index/score_data.h"
+
+namespace meta
+{
+namespace index
+{
+
+// makers
+
+const util::string_view dirichlet_digamma_rec::id = "dirichlet-digamma-rec";
+template <>
+std::unique_ptr<ranker>
+    make_ranker<dirichlet_digamma_rec>(const cpptoml::table& config)
+{
+    auto mu = config.get_as<double>("mu").value_or(dirichlet_digamma_rec::default_mu);
+    if (mu < 0)
+        throw ranker_exception{"dirichlet-digamma-rec mu must be >= 0"};
+    return make_unique<dirichlet_digamma_rec>(mu);
+}
+
+const util::string_view dirichlet_log_approx::id = "dirichlet-log-approx";
+template <>
+std::unique_ptr<ranker>
+    make_ranker<dirichlet_log_approx>(const cpptoml::table& config)
+{
+    auto mu = config.get_as<double>("mu").value_or(dirichlet_log_approx::default_mu);
+    if (mu < 0)
+        throw ranker_exception{"dirichlet-log-approx mu must be >= 0"};
+    return make_unique<dirichlet_log_approx>(mu);
+}
+
+const util::string_view dirichlet_mackay_peto::id = "dirichlet-mackay-peto";
+template <>
+std::unique_ptr<ranker>
+    make_ranker<dirichlet_mackay_peto>(const cpptoml::table& config)
+{
+    auto mu = config.get_as<double>("mu").value_or(dirichlet_mackay_peto::default_mu);
+    if (mu < 0)
+        throw ranker_exception{"dirichlet-mackay-peto mu must be >= 0"};
+    return make_unique<dirichlet_mackay_peto>(mu);
+}
+
+// constructors
+
+dirichlet_digamma_rec::dirichlet_digamma_rec(float mu) : dirichlet_prior_opt(mu)
+{
+    // nothing
+}
+
+dirichlet_digamma_rec::dirichlet_digamma_rec(std::istream& in)
+    : dirichlet_prior_opt(in)
+{
+    // nothing
+}
+
+void dirichlet_digamma_rec::save(std::ostream& out) const
+{
+    io::packed::write(out, id);
+
+    io::packed::write(out, mu_);
+}
+
+
+dirichlet_log_approx::dirichlet_log_approx(float mu) : dirichlet_prior_opt(mu)
+{
+    // nothing
+}
+
+dirichlet_log_approx::dirichlet_log_approx(std::istream& in)
+    : dirichlet_prior_opt(in)
+{
+    // nothing
+}
+
+void dirichlet_log_approx::save(std::ostream& out) const
+{
+    io::packed::write(out, id);
+
+    io::packed::write(out, mu_);
+}
+
+
+dirichlet_mackay_peto::dirichlet_mackay_peto(float mu) : dirichlet_prior_opt(mu)
+{
+    // nothing
+}
+
+dirichlet_mackay_peto::dirichlet_mackay_peto(std::istream& in)
+    : dirichlet_prior_opt(in)
+{
+    // nothing
+}
+
+void dirichlet_mackay_peto::save(std::ostream& out) const
+{
+    io::packed::write(out, id);
+
+    io::packed::write(out, mu_);
+}
+
+// optimization methods
+
+std::map<term_id, double> dirichlet_digamma_rec::optimize_mu(docs_data& dd, float eps, int max_iter) {
+    bool all_optimized = false;
+    int iter_num = 0;
+    double D, S;
+    double n_max = dd.docs_counts.rbegin()->first;
+
+    // start values for alpha and alpha_m
+    double alpha = default_mu, alpha_mk_new;
+    std::map<term_id, double> alpha_m = dd.alpha_m;
+
+    while (!all_optimized && iter_num < max_iter){
+        D = 0.0;
+        S = 0.0;
+        all_optimized = true;
+
+        alpha = get_alpha(alpha_m);
+
+        count_d c_d;
+        for (count_d n = 1; n <= n_max; n++){
+            c_d = dd.docs_counts[n];
+
+            D += 1.0/(n - 1 + alpha);
+            S += c_d * D;
+        }
+
+        term_id k;
+        std::map<count_d, count_d> c_k;
+        double S_k;
+        for (auto kv: dd.terms_docs_counts){
+            k = kv.first;
+            c_k = kv.second;
+
+            D = 0.0;
+            S_k = 0.0;
+
+            count_d c_k_n, n_k_max = c_k.rbegin()->first;
+            for (count_d n = 1; n <= n_k_max; n++){
+                c_k_n = c_k[n];
+
+                D += 1.0/(n - 1 + alpha_m[k]);
+                S_k += c_k_n * D;
+            }
+
+            alpha_mk_new = alpha_m[k] * S_k / S;
+
+            if (std::abs(alpha_mk_new - alpha_m[k]) > eps){
+                all_optimized = false;
+            }
+
+            alpha_m[k] = alpha_mk_new;
+        }
+
+        iter_num++;
+    }
+
+    mu_ = get_alpha(alpha_m);
+
+    return alpha_m;
+}
+
+std::map<term_id, double> dirichlet_log_approx::optimize_mu(docs_data& dd, float eps, int max_iter) {
+    bool all_optimized = false;
+    int iter_num = 0;
+    double S, S_k;
+    double n_max = dd.docs_counts.rbegin()->first;
+
+    // start values for alpha and alpha_m
+    double alpha = default_mu, alpha_mk_new;
+    std::map<term_id, double> alpha_m = dd.alpha_m;
+
+    while (!all_optimized && iter_num < max_iter){
+        S = 0.0;
+        all_optimized = true;
+
+        alpha = get_alpha(alpha_m);
+
+        count_d c_d;
+        // TODO: skip the zero docs counts
+        for (count_d n = 1; n <= n_max; n++){
+            c_d = dd.docs_counts[n];
+
+            if (c_d != 0){
+                S += c_d * (1.0/alpha + log(n + alpha - 0.5) - log(alpha + 0.5));
+            }
+        }
+
+        term_id k;
+        std::map<count_d, count_d> c_k;
+        for (auto kv: dd.terms_docs_counts){
+            k = kv.first;
+            c_k = kv.second;
+
+            S_k = 0.0;
+
+            count_d c_k_n, n_k_max = c_k.rbegin()->first;
+            // TODO: skip the zero docs counts
+            for (count_d n = 1; n <= n_k_max; n++){
+                c_k_n = c_k[n];
+
+                if (c_k_n != 0){
+                    S_k += c_k_n * (1.0/alpha_m[k] + log(n + alpha_m[k] - 0.5) - log(alpha_m[k] + 0.5));
+                }
+            }
+
+            alpha_mk_new = alpha_m[k] * S_k / S;
+
+            if (std::abs(alpha_mk_new - alpha_m[k]) > eps){
+                all_optimized = false;
+            }
+
+            alpha_m[k] = alpha_mk_new;
+        }
+
+        iter_num++;
+    }
+
+    mu_ = get_alpha(alpha_m);
+
+    return alpha_m;
+}
+
+std::map<term_id, double> dirichlet_mackay_peto::optimize_mu(docs_data& dd, float eps, int max_iter) {
+    eps = eps;
+    max_iter = max_iter;
+    eps = dd.ref_size;
+    mu_ = 0;
+    std::map<term_id, double> alpha_m;
+
+    return alpha_m;
+}
+
+}
+}

From f7b634a1f1bde37ff4f3043efd2ae373ea335d77 Mon Sep 17 00:00:00 2001
From: Alex2304 <alex2304el@gmail.com>
Date: Fri, 1 Dec 2017 03:17:25 +0300
Subject: [PATCH 28/30] [opt] + MacKay and Peto method

---
 src/index/ranker/dirichlet_prior_opt.cpp | 59 ++++++++++++++++++++++--
 src/index/ranker/test_opt/test.cpp       | 14 +++---
 2 files changed, 62 insertions(+), 11 deletions(-)

diff --git a/src/index/ranker/dirichlet_prior_opt.cpp b/src/index/ranker/dirichlet_prior_opt.cpp
index 2a2df1ce5..4405719c4 100644
--- a/src/index/ranker/dirichlet_prior_opt.cpp
+++ b/src/index/ranker/dirichlet_prior_opt.cpp
@@ -229,11 +229,60 @@ std::map<term_id, double> dirichlet_log_approx::optimize_mu(docs_data& dd, float
 }
 
 std::map<term_id, double> dirichlet_mackay_peto::optimize_mu(docs_data& dd, float eps, int max_iter) {
-    eps = eps;
-    max_iter = max_iter;
-    eps = dd.ref_size;
-    mu_ = 0;
-    std::map<term_id, double> alpha_m;
+    bool all_optimized = false;
+    int iter_num = 0;
+
+    // start values for alpha and alpha_m
+    double alpha = default_mu, alpha_mk_new;
+    std::map<term_id, double> alpha_m = dd.alpha_m;
+
+    while (!all_optimized && iter_num < max_iter){
+        all_optimized = true;
+
+        alpha = get_alpha(alpha_m);
+
+        // compute K(alpha)
+        double K_alpha = 0;
+        for (auto d_id: dd.doc_ids){
+            double n_d = dd.idx.doc_size(d_id);
+            K_alpha += log((n_d + alpha) / alpha) + 0.5 * n_d / (alpha * (n_d + alpha));
+        }
+
+        term_id k;
+        std::map<count_d, count_d> c_k;
+        for (auto kv: dd.terms_docs_counts){
+            k = kv.first;
+            c_k = kv.second;
+
+            count_d n_k_max = c_k.rbegin()->first;
+
+            // compute V_k
+            count_d V_k = dd.idx.doc_freq(k);
+
+            // compute H_k and G_k
+            double H_k = 0, G_k = 0;
+            count_d N_f = 0;
+            for (count_d f = n_k_max; f >= 2; f--){
+                N_f += c_k[f];
+
+                G_k += (double) N_f / (f - 1.0);
+                H_k += (double) N_f / pow(f - 1.0, 2);
+            }
+
+            // recompute alpha_mk
+            alpha_mk_new = 2 * V_k / (K_alpha - G_k + sqrt(pow(K_alpha - G_k, 2) + 4 * H_k * V_k));
+
+            if (std::abs(alpha_mk_new - alpha_m[k]) > eps){
+                all_optimized = false;
+            }
+
+            alpha_m[k] = alpha_mk_new;
+        }
+
+        iter_num++;
+    }
+
+    mu_ = get_alpha(alpha_m);
 
     return alpha_m;
 }
diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp
index 2c27f2427..a12771f53 100644
--- a/src/index/ranker/test_opt/test.cpp
+++ b/src/index/ranker/test_opt/test.cpp
@@ -48,6 +48,7 @@ int main(int argc, char* argv[])
 
     index::dirichlet_digamma_rec ranker1;
     index::dirichlet_log_approx ranker2;
+    index::dirichlet_mackay_peto ranker3;
 
     auto time1 = common::time([&]()
     {
@@ -66,12 +67,13 @@ int main(int argc, char* argv[])
     display_result(alpha, alpha_m, time2.count() / 1.0);
 
 
-//    time = common::time([&]()
-//    {
-//        // Create and make score of optimizer
-//        index::mackay_peto ranker;
-//        std::cout << ranker.get_optimized_mu(*idx) << std::endl;
-//    });
+    auto time3 = common::time([&]()
+    {
+        alpha_m = ranker3.get_optimized_mu(*idx, eps, iters);
+        alpha = ranker3.parameter();
+    });
+
+    display_result(alpha, alpha_m, time3.count() / 1.0);
 
     return 0;
 }

From d4b0a8d9f50dcd6dc1b09b96a2867838345e2660 Mon Sep 17 00:00:00 2001
From: Alex2304 <alex2304el@gmail.com>
Date: Sun, 3 Dec 2017 14:22:33 +0300
Subject: [PATCH 29/30] [opt] + comments and docs

---
 .../meta/index/ranker/dirichlet_prior_opt.h   | 84 +++++++++++++++----
 1 file changed, 66 insertions(+), 18 deletions(-)

diff --git a/include/meta/index/ranker/dirichlet_prior_opt.h b/include/meta/index/ranker/dirichlet_prior_opt.h
index face18029..351b01b2a 100644
--- a/include/meta/index/ranker/dirichlet_prior_opt.h
+++ b/include/meta/index/ranker/dirichlet_prior_opt.h
@@ -18,19 +18,17 @@ namespace meta
 namespace index
 {
 
-// # TODO: choose template type instead of long
 typedef long count_d;
 
 struct docs_data
 {
-    // general info
-
+    /// inverted index
     const inverted_index& idx;
-    /// ids of all documents
+    /// ids of all documents in the index
     std::vector<doc_id> doc_ids;
-    /// ids of all terms
+    /// ids of all terms in the index
     std::vector<term_id> term_ids;
-    /// total size of documents
+    /// total size of all documents
     count_d ref_size;
     /// C_.(n)
     std::map<count_d, count_d> docs_counts;
@@ -62,18 +60,10 @@ struct docs_data
 
 
 /**
- * Implements Bayesian smoothing with a Dirichlet prior.
+ * Abstract class for Dirichlet prior smoothing with an optimized constant mu.
+ * The constant mu is optimized at scoring time using statistics of the documents in the index.
  *
- * Required config parameters:
- * ~~~toml
- * [ranker]
- * method = "dirichlet-prior"
- * ~~~
- *
- * Optional config parameters:
- * ~~~toml
- * mu = 2000.0
- * ~~~
+ * Derived classes must override the virtual method optimize_mu(docs_data& dd, float eps, int max_iter).
  */
 class dirichlet_prior_opt : public dirichlet_prior{
 public:
@@ -86,6 +76,7 @@ class dirichlet_prior_opt : public dirichlet_prior{
                                      ForwardIterator end,
                                      uint64_t num_results = 10)
     {
+        // optimize mu before scoring
         this->optimize_mu(idx);
 
         return ranker::score(idx, begin, end, num_results);
@@ -107,13 +98,24 @@ class dirichlet_prior_opt : public dirichlet_prior{
     }
 
 private:
+    /**
+     * Extracts the information necessary to find the optimal mu and wraps it into docs_data.
+     * Then calls the class-specific implementation of the optimize_mu function.
+     * The optimal value of mu that is found is written to the mu_ member of the class.
+     *
+     * @param idx inverted index
+     * @param eps convergence precision
+     * @param max_iter maximal number of iterations (upper bound)
+     *
+     * @return optimal value [alpha * m_i] for each term
+     */
     std::map<term_id, double> optimize_mu(const inverted_index& idx, float eps=1e-6, int max_iter=10000) {
         // parse idx and extract what we need
 
         auto docs_ids = idx.docs();
         auto terms_ids = idx.terms();
 
-        // calculate ref_size
+        // calculate total size of all documents
         count_d ref_size = 0;
         for (auto& id : docs_ids)
             ref_size += idx.doc_size(id);
@@ -152,9 +154,31 @@ class dirichlet_prior_opt : public dirichlet_prior{
         return optimize_mu(dd, eps, max_iter);
     }
 
+    /**
+     * Finds optimal mu using information from given docs_data structure.
+     * Writes optimal mu to the corresponding field of the class.
+     *
+     * @param dd statistics extracted from the inverted index (see docs_data)
+     * @param eps convergence precision
+     * @param max_iter maximal number of iterations (upper bound)
+     *
+     * @return optimal value [alpha * m_i] for each term
+     */
     virtual std::map<term_id, double> optimize_mu(docs_data& dd, float eps, int max_iter) = 0;
 };
 
+/**
+ * Implements Dirichlet Prior smoothing with optimized constant mu.
+ *
+ * Optimization method is Fixed-Point Iteration with digamma recurrence relation
+ * described at: https://people.cs.umass.edu/~wallach/theses/wallach_phd_thesis.pdf, pp. 27-28.
+ *
+ * Required config parameters:
+ * ~~~toml
+ * [ranker]
+ * method = "dirichlet-digamma-rec"
+ * ~~~
+ */
 class dirichlet_digamma_rec: public dirichlet_prior_opt{
 public:
     const static util::string_view id;
@@ -176,6 +200,18 @@ class dirichlet_digamma_rec: public dirichlet_prior_opt{
 
 };
 
+/**
+ * Implements Dirichlet Prior smoothing with optimized constant mu.
+ *
+ * Optimization method is Fixed-Point Iteration with digamma differences log approximation
+ * described at: https://people.cs.umass.edu/~wallach/theses/wallach_phd_thesis.pdf, pp. 28-29.
+ *
+ * Required config parameters:
+ * ~~~toml
+ * [ranker]
+ * method = "dirichlet-log-approx"
+ * ~~~
+ */
 class dirichlet_log_approx: public dirichlet_prior_opt{
 public:
     const static util::string_view id;
@@ -196,6 +232,18 @@ class dirichlet_log_approx: public dirichlet_prior_opt{
     std::map<term_id, double> optimize_mu(docs_data& dd, float eps, int max_iter) override;
 };
 
+/**
+ * Implements Dirichlet Prior smoothing with optimized constant mu.
+ *
+ * Optimization method is MacKay and Peto's Fixed-Point Iteration with efficient computation of N_fk
+ * described at: https://people.cs.umass.edu/~wallach/theses/wallach_phd_thesis.pdf, p. 30.
+ *
+ * Required config parameters:
+ * ~~~toml
+ * [ranker]
+ * method = "dirichlet-mackay-peto"
+ * ~~~
+ */
 class dirichlet_mackay_peto: public dirichlet_prior_opt{
 public:
     const static util::string_view id;

From 001fac6cecbe2b73b5db5888f4056460423f1ac1 Mon Sep 17 00:00:00 2001
From: Alex2304 <alex2304el@gmail.com>
Date: Mon, 4 Dec 2017 12:51:49 +0300
Subject: [PATCH 30/30] [opt] - test files

---
 include/meta/stats/statistics.h          | 102 -----------------------
 src/index/ranker/CMakeLists.txt          |   4 +-
 src/index/ranker/dirichlet_prior.cpp     |   1 -
 src/index/ranker/test_opt/CMakeLists.txt |  10 ---
 src/index/ranker/test_opt/test.cpp       |  79 ------------------
 5 files changed, 1 insertion(+), 195 deletions(-)
 delete mode 100644 src/index/ranker/test_opt/CMakeLists.txt
 delete mode 100644 src/index/ranker/test_opt/test.cpp

diff --git a/include/meta/stats/statistics.h b/include/meta/stats/statistics.h
index 168af1245..ac1fb054e 100644
--- a/include/meta/stats/statistics.h
+++ b/include/meta/stats/statistics.h
@@ -18,108 +18,6 @@ namespace meta
 {
 namespace stats
 {
-
-#ifndef M_PIl
-/** The constant Pi in high precision */
-#define M_PIl 3.1415926535897932384626433832795029L
-#endif
-#ifndef M_GAMMAl
-/** Euler's constant in high precision */
-#define M_GAMMAl 0.5772156649015328606065120900824024L
-#endif
-#ifndef M_LN2l
-/** the natural logarithm of 2 in high precision */
-#define M_LN2l 0.6931471805599453094172321214581766L
-#endif
-
-/** The digamma function in long double precision.
-* @param x the real value of the argument
-* @return the value of the digamma (psi) function at that point
-* @author Richard J. Mathar
-* @since 2005-11-24
-*/
-long double digamma(long double x)
-{
-    /* force into the interval 1..3 */
-    if( x < 0.0L )
-        return digamma(1.0L-x)+M_PIl/tanl(M_PIl*(1.0L-x)) ;	/* reflection formula */
-    else if( x < 1.0L )
-        return digamma(1.0L+x)-1.0L/x ;
-    else if ( x == 1.0L)
-        return -M_GAMMAl ;
-    else if ( x == 2.0L)
-        return 1.0L-M_GAMMAl ;
-    else if ( x == 3.0L)
-        return 1.5L-M_GAMMAl ;
-    else if ( x > 3.0L)
-        /* duplication formula */
-        return 0.5L*(digamma(x/2.0L)+digamma((x+1.0L)/2.0L))+M_LN2l ;
-    else
-    {
-        /* Just for your information, the following lines contain
-        * the Maple source code to re-generate the table that is
-        * eventually becoming the Kncoe[] array below
-        * interface(prettyprint=0) :
-        * Digits := 63 :
-        * r := 0 :
-        *
-        * for l from 1 to 60 do
-        * 	d := binomial(-1/2,l) :
-        * 	r := r+d*(-1)^l*(Zeta(2*l+1) -1) ;
-        * 	evalf(r) ;
-        * 	print(%,evalf(1+Psi(1)-r)) ;
-        *o d :
-        *
-        * for N from 1 to 28 do
-        * 	r := 0 :
-        * 	n := N-1 :
-        *
-        *	for l from iquo(n+3,2) to 70 do
-        *		d := 0 :
-        *		for s from 0 to n+1 do
-        *		 d := d+(-1)^s*binomial(n+1,s)*binomial((s-1)/2,l) :
-        *		od :
-        *		if 2*l-n > 1 then
-        *		r := r+d*(-1)^l*(Zeta(2*l-n) -1) :
-        *		fi :
-        *	od :
-        *	print(evalf((-1)^n*2*r)) ;
-        *od :
-        *quit :
-        */
-        static long double Kncoe[] = { .30459198558715155634315638246624251L,
-        .72037977439182833573548891941219706L, -.12454959243861367729528855995001087L,
-        .27769457331927827002810119567456810e-1L, -.67762371439822456447373550186163070e-2L,
-        .17238755142247705209823876688592170e-2L, -.44817699064252933515310345718960928e-3L,
-        .11793660000155572716272710617753373e-3L, -.31253894280980134452125172274246963e-4L,
-        .83173997012173283398932708991137488e-5L, -.22191427643780045431149221890172210e-5L,
-        .59302266729329346291029599913617915e-6L, -.15863051191470655433559920279603632e-6L,
-        .42459203983193603241777510648681429e-7L, -.11369129616951114238848106591780146e-7L,
-        .304502217295931698401459168423403510e-8L, -.81568455080753152802915013641723686e-9L,
-        .21852324749975455125936715817306383e-9L, -.58546491441689515680751900276454407e-10L,
-        .15686348450871204869813586459513648e-10L, -.42029496273143231373796179302482033e-11L,
-        .11261435719264907097227520956710754e-11L, -.30174353636860279765375177200637590e-12L,
-        .80850955256389526647406571868193768e-13L, -.21663779809421233144009565199997351e-13L,
-        .58047634271339391495076374966835526e-14L, -.15553767189204733561108869588173845e-14L,
-        .41676108598040807753707828039353330e-15L, -.11167065064221317094734023242188463e-15L } ;
-
-        register long double Tn_1 = 1.0L ;	/* T_{n-1}(x), started at n=1 */
-        register long double Tn = x-2.0L ;	/* T_{n}(x) , started at n=1 */
-        register long double resul = Kncoe[0] + Kncoe[1]*Tn ;
-
-        x -= 2.0L ;
-
-        for(int n = 2 ; n < sizeof(Kncoe)/sizeof(long double) ;n++)
-        {
-            const long double Tn1 = 2.0L * x * Tn - Tn_1 ;	/* Chebyshev recursion, Eq. 22.7.4 Abramowitz-Stegun */
-            resul += Kncoe[n]*Tn1 ;
-            Tn_1 = Tn ;
-            Tn = Tn1 ;
-        }
-        return resul ;
-    }
-}
-
 /**
  * Computation for \f$E_d[f(x)]\f$ where \f$d\f$ is specified by the
  * `dist` parameter and \f$f(x)\f$ is the `fun` parameter.
diff --git a/src/index/ranker/CMakeLists.txt b/src/index/ranker/CMakeLists.txt
index f0e699418..84d22701d 100644
--- a/src/index/ranker/CMakeLists.txt
+++ b/src/index/ranker/CMakeLists.txt
@@ -1,10 +1,8 @@
 project(meta-ranker)
 
-add_subdirectory(test_opt)
-
 add_library(meta-ranker absolute_discount.cpp
                         dirichlet_prior.cpp
-						dirichlet_prior_opt.cpp
+                        dirichlet_prior_opt.cpp
                         jelinek_mercer.cpp
                         lm_ranker.cpp
                         okapi_bm25.cpp
diff --git a/src/index/ranker/dirichlet_prior.cpp b/src/index/ranker/dirichlet_prior.cpp
index 3a43297d6..07536afbe 100644
--- a/src/index/ranker/dirichlet_prior.cpp
+++ b/src/index/ranker/dirichlet_prior.cpp
@@ -55,6 +55,5 @@ std::unique_ptr<ranker>
         throw ranker_exception{"dirichlet-prior mu must be >= 0"};
     return make_unique<dirichlet_prior>(mu);
 }
-
 }
 }
diff --git a/src/index/ranker/test_opt/CMakeLists.txt b/src/index/ranker/test_opt/CMakeLists.txt
deleted file mode 100644
index 1e8cdc0ee..000000000
--- a/src/index/ranker/test_opt/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-project(meta-dirichlet-test)
-
-include_directories(../../../../include)
-
-add_executable(test_opt test.cpp)
-
-target_link_libraries(test_opt meta-ranker
-    meta-sequence-analyzers
-    meta-parser-analyzers)
-
diff --git a/src/index/ranker/test_opt/test.cpp b/src/index/ranker/test_opt/test.cpp
deleted file mode 100644
index a12771f53..000000000
--- a/src/index/ranker/test_opt/test.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-#include "meta/corpus/document.h"
-#include "meta/index/ranker/all.h"
-#include "meta/index/forward_index.h"
-
-#include <iostream>
-
-#include "meta/index/inverted_index.h"
-#include "meta/logging/logger.h"
-#include "meta/parser/analyzers/tree_analyzer.h"
-#include "meta/sequence/analyzers/ngram_pos_analyzer.h"
-#include "meta/util/time.h"
-
-using namespace meta;
-
-
-void display_result(float alpha, std::map<term_id, double> alpha_m, float time){
-    for (auto kv: alpha_m){
-        std::cout << kv.second << " ";
-    }
-    std::cout << std::endl << alpha << std::endl << time << std::endl;
-}
-
-int main(int argc, char* argv[])
-{
-    if (argc != 2)
-    {
-        std::cerr << "Usage:\t" << argv[0] << " configFile" << std::endl;
-        return 1;
-    }
-
-    // Turn on logging to std::cerr.
-    logging::set_cerr_logging();
-
-    // Register additional analyzers
-    parser::register_analyzers();
-    sequence::register_analyzers();
-
-    // Creates an inverted index with no cache. We don't need a cache here
-    //  since we're never searching the index, only building it.
-    auto config = cpptoml::parse_file(argv[1]);
-    auto idx = index::make_index<index::inverted_index>(*config);
-
-    double eps = 1e-6;
-    int iters = 10000;
-
-    float alpha;
-    std::map<term_id, double> alpha_m;
-
-    index::dirichlet_digamma_rec ranker1;
-    index::dirichlet_log_approx ranker2;
-    index::dirichlet_mackay_peto ranker3;
-
-    auto time1 = common::time([&]()
-    {
-        alpha_m = ranker1.get_optimized_mu(*idx, eps, iters);
-        alpha = ranker1.parameter();
-    });
-
-    display_result(alpha, alpha_m, time1.count() / 1.0);
-
-    auto time2 = common::time([&]()
-    {
-        alpha_m = ranker2.get_optimized_mu(*idx, eps, iters);
-        alpha = ranker2.parameter();
-    });
-
-    display_result(alpha, alpha_m, time2.count() / 1.0);
-
-
-    auto time3 = common::time([&]()
-    {
-        alpha_m = ranker3.get_optimized_mu(*idx, eps, iters);
-        alpha = ranker3.parameter();
-    });
-
-    display_result(alpha, alpha_m, time3.count() / 1.0);
-
-    return 0;
-}