Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hyperparameters estimation for LDA #193

Open
wants to merge 36 commits into
base: develop
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
1aa7032
+ added optimization.h and .gitignore updated
alex2304 Nov 18, 2017
5e3e23c
[opt] dirichlet_optimizer class, digamma function
alex2304 Nov 19, 2017
b8dbc7d
[opt] minka_fpi method draft
alex2304 Nov 19, 2017
3dc03a8
[opt] optimization.h errors fixed, test without MeTa
alex2304 Nov 20, 2017
eeb9168
[opt] debug output
alex2304 Nov 20, 2017
766754f
Adding optimization.cpp
MakKolts Nov 20, 2017
98c3e7d
Merge branch 'develop' of https://github.com/alex2304/meta into develop
Nov 20, 2017
e9c99df
[opt] classes for methods in dirichlet_prior
alex2304 Nov 29, 2017
b11e704
Merge branch 'develop' of https://github.com/alex2304/meta into develop
Nov 29, 2017
54d7272
Deletion of previous stuff
MakKolts Nov 29, 2017
6585189
Test for dirichlet optimizations
MakKolts Nov 29, 2017
c0a357c
Private/public methods
MakKolts Nov 29, 2017
76d32ae
[opt] test indexes
alex2304 Nov 29, 2017
4ccda58
Interface for methods
MakKolts Nov 29, 2017
248c151
Refactoring of optimization interface
MakKolts Nov 29, 2017
61ece78
[opt] tmp for merge
alex2304 Nov 29, 2017
f979264
Tests for all functions at same time
MakKolts Nov 29, 2017
ba00c86
[opt] + term_ids()
alex2304 Nov 29, 2017
1f13f95
[opt] merged dirichlet_prior
alex2304 Nov 29, 2017
ed475b5
[opt] + first method without testing
alex2304 Nov 30, 2017
4528ec6
[opt] *first method builds
alex2304 Nov 30, 2017
312a485
[opt] * method works
alex2304 Nov 30, 2017
b60cc54
[opt] *first method debugged
alex2304 Nov 30, 2017
0a0851c
[opt] method refactored
alex2304 Nov 30, 2017
d726f70
[opt] + method2
alex2304 Nov 30, 2017
4a6a240
Adding constructors and register for new ranker classes
MakKolts Nov 30, 2017
f55e0de
Merge branch 'develop' of https://github.com/alex2304/meta into develop
MakKolts Nov 30, 2017
bc948ce
Add rankers to factory
MakKolts Nov 30, 2017
78d6d5c
[opt] + benchmark
alex2304 Nov 30, 2017
25d89d1
Merge branch 'develop' of https://github.com/alex2304/meta into develop
alex2304 Nov 30, 2017
5bc6ee6
Minor fix for output
MakKolts Nov 30, 2017
4f8fa1d
[opt] + dirichlet_opt files
alex2304 Nov 30, 2017
c8ddfbf
[opt] + dirichlet_prior_opt
alex2304 Nov 30, 2017
f7b634a
[opt] + MacKay and Peto method
alex2304 Dec 1, 2017
d4b0a8d
[opt] + comments and docs
alex2304 Dec 3, 2017
001fac6
[opt] - test files
alex2304 Dec 4, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
[opt] + dirichlet_opt files
  • Loading branch information
alex2304 committed Nov 30, 2017
commit 4f8fa1d59e3dc9f862a491a08b744ed9a789fd8b
1 change: 1 addition & 0 deletions include/meta/index/ranker/all.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "meta/index/ranker/ranker.h"
#include "meta/index/ranker/absolute_discount.h"
#include "meta/index/ranker/dirichlet_prior.h"
#include "meta/index/ranker/dirichlet_prior_opt.h"
#include "meta/index/ranker/jelinek_mercer.h"
#include "meta/index/ranker/lm_ranker.h"
#include "meta/index/ranker/okapi_bm25.h"
331 changes: 0 additions & 331 deletions include/meta/index/ranker/dirichlet_prior.h
Original file line number Diff line number Diff line change
@@ -12,8 +12,6 @@
#include "meta/index/ranker/lm_ranker.h"
#include "meta/index/ranker/ranker_factory.h"

#include <cmath>
#include <utility>

namespace meta
{
namespace index
@@ -79,342 +77,13 @@ class dirichlet_prior : public language_model_ranker
};


// TODO: make the count type a template parameter instead of hard-coding long
using count_d = long;

struct docs_data
{
// general info

const inverted_index& idx;
/// ids of all documents
std::vector<doc_id> doc_ids;
/// ids of all terms
std::vector<term_id> term_ids;
/// total size of documents
count_d ref_size;
/// C_.(n)
std::map<count_d, count_d> docs_counts;
/// C_k(n)
std::map<term_id, std::map<count_d, count_d>> terms_docs_counts;
/// vector alpha_m
std::map<term_id, double> alpha_m;

/**
* Constructor to initialize most elements.
* @param p_idx The index that is being used
* @param p_doc_ids ids of all docs
* @param p_term_ids ids of all terms
*/
docs_data(const inverted_index& p_idx, std::vector<doc_id> p_doc_ids, std::vector<term_id> p_term_ids, count_d p_ref_size,
std::map<count_d, count_d> p_docs_counts, std::map<term_id, std::map<count_d, count_d>> p_terms_docs_counts,
std::map<term_id, double> p_alpha_m)
: idx(p_idx), // gcc no non-const ref init from brace init list
doc_ids{p_doc_ids},
term_ids{p_term_ids},
ref_size{p_ref_size},
docs_counts{p_docs_counts},
terms_docs_counts{p_terms_docs_counts},
alpha_m{p_alpha_m}
{
/* nothing */
}
};

class dirichlet_prior_opt : public dirichlet_prior{
public:

dirichlet_prior_opt(float mu) : dirichlet_prior(mu) { }

dirichlet_prior_opt(std::istream& in) : dirichlet_prior(in) { }

template <class ForwardIterator>
std::vector<search_result> score(inverted_index& idx, ForwardIterator begin,
ForwardIterator end,
uint64_t num_results = 10)
{
// optimize mu according to ranker_context before ranking
this->optimize_mu(idx);

return ranker::score(idx, begin, end, num_results);
}

std::map<term_id, double> get_optimized_mu(const inverted_index& idx, float eps, int max_iter) {
return optimize_mu(idx, eps, max_iter);
}

protected:
inline double get_alpha(std::map<term_id, double> alpha_m){
double alpha = 0;

for (auto alpha_m_k: alpha_m){
alpha += alpha_m_k.second;
}

return alpha;
}

private:
std::map<term_id, double> optimize_mu(const inverted_index& idx, float eps=1e-6, int max_iter=10000) {
// parse idx and extract what we need
auto docs_ids = idx.docs();
auto terms_ids = idx.terms();

// calculate ref_size
count_d ref_size = 0;
for (auto& id : docs_ids)
ref_size += idx.doc_size(id);

// calculate C_.(n) and C_k(n)
std::map<count_d, count_d> docs_counts;
std::map<term_id, std::map<count_d, count_d>> terms_docs_counts;

long doc_size, doc_term_freq;
for (auto d_id: docs_ids){
doc_size = idx.doc_size(d_id);

//// increase number of docs with the given size (C_.(n))
docs_counts[doc_size] += 1;

for (auto t_id: terms_ids){
doc_term_freq = idx.term_freq(t_id, d_id);

//// increase number of docs with the given count of word t_id (C_k(n))
terms_docs_counts[t_id][doc_term_freq] += 1;
}
}

// fill start vector alpha_m
std::map<term_id, double> alpha_m;

for (auto t_id: terms_ids){
alpha_m[t_id] = idx.total_num_occurences(t_id) * default_mu;
alpha_m[t_id] /= (double)ref_size;
}

// create docs_data
docs_data dd{idx, docs_ids, terms_ids, ref_size, docs_counts, terms_docs_counts, alpha_m};

// call optimizer
return optimize_mu(dd, eps, max_iter);
}

virtual std::map<term_id, double> optimize_mu(docs_data& dd, float eps, int max_iter) = 0;
};

class dirichlet_digamma_rec: public dirichlet_prior_opt{
public:
const static util::string_view id;

/**
* @param mu
*/
dirichlet_digamma_rec(float mu = default_mu);

/**
* Loads a dirichlet_prior ranker from a stream.
* @param in The stream to read from
*/
dirichlet_digamma_rec(std::istream& in);

void save(std::ostream& out) const override;
private:
std::map<term_id, double> optimize_mu(docs_data& dd, float eps, int max_iter) override {
bool all_optimized = false;
int iter_num = 0;
double D, S;
double n_max = dd.docs_counts.rbegin()->first;

// start values for alpha and alpha_m
double alpha = default_mu, alpha_mk_new;
std::map<term_id, double> alpha_m = dd.alpha_m;

while (!all_optimized && iter_num < max_iter){
D = 0.0;
S = 0.0;
all_optimized = true;

alpha = get_alpha(alpha_m);

count_d c_d;
for (count_d n = 1; n <= n_max; n++){
c_d = dd.docs_counts[n];

D += 1.0/(n - 1 + alpha);
S += c_d * D;
}

term_id k;
std::map<count_d, count_d> c_k;
double S_k;
for (auto kv: dd.terms_docs_counts){
k = kv.first;
c_k = kv.second;

D = 0.0;
S_k = 0.0;

count_d c_k_n, n_k_max = c_k.rbegin()->first;
for (count_d n = 1; n <= n_k_max; n++){
c_k_n = c_k[n];

D += 1.0/(n - 1 + alpha_m[k]);
S_k += c_k_n * D;
}

alpha_mk_new = alpha_m[k] * S_k / S;

if (std::abs(alpha_mk_new - alpha_m[k]) > eps){
all_optimized = false;
}

alpha_m[k] = alpha_mk_new;
}

iter_num++;
}

mu_ = get_alpha(alpha_m);

return alpha_m;
}

};

class dirichlet_log_approx: public dirichlet_prior_opt{
public:
const static util::string_view id;

/**
* @param mu
*/
dirichlet_log_approx(float mu = default_mu);

/**
* Loads a dirichlet_prior ranker from a stream.
* @param in The stream to read from
*/
dirichlet_log_approx(std::istream& in);

void save(std::ostream& out) const override;
private:
std::map<term_id, double> optimize_mu(docs_data& dd, float eps, int max_iter) override {
bool all_optimized = false;
int iter_num = 0;
double S, S_k;
double n_max = dd.docs_counts.rbegin()->first;

// start values for alpha and alpha_m
double alpha = default_mu, alpha_mk_new;
std::map<term_id, double> alpha_m = dd.alpha_m;

while (!all_optimized && iter_num < max_iter){
S = 0.0;
all_optimized = true;

alpha = get_alpha(alpha_m);

count_d c_d;
// TODO: skip the zero docs counts
for (count_d n = 1; n <= n_max; n++){
c_d = dd.docs_counts[n];

if (c_d != 0){
S += c_d * (1.0/alpha + log(n + alpha - 0.5) - log(alpha + 0.5));
}
}

term_id k;
std::map<count_d, count_d> c_k;
for (auto kv: dd.terms_docs_counts){
k = kv.first;
c_k = kv.second;

S_k = 0.0;

count_d c_k_n, n_k_max = c_k.rbegin()->first;
// TODO: skip the zero docs counts
for (count_d n = 1; n <= n_k_max; n++){
c_k_n = c_k[n];

if (c_k_n != 0){
S_k += c_k_n * (1.0/alpha_m[k] + log(n + alpha_m[k] - 0.5) - log(alpha_m[k] + 0.5));
}
}

alpha_mk_new = alpha_m[k] * S_k / S;

if (std::abs(alpha_mk_new - alpha_m[k]) > eps){
all_optimized = false;
}

alpha_m[k] = alpha_mk_new;
}

iter_num++;
}

mu_ = get_alpha(alpha_m);

return alpha_m;
}
};

/**
 * Dirichlet-prior ranker reserved for the MacKay and Peto estimation
 * method. The optimizer below is still a placeholder.
 */
class dirichlet_mackay_peto : public dirichlet_prior_opt
{
  public:
    const static util::string_view id;

    /**
     * @param mu Dirichlet prior parameter (defaults to default_mu)
     */
    dirichlet_mackay_peto(float mu = default_mu);

    /**
     * Loads a dirichlet_prior ranker from a stream.
     * @param in The stream to read from
     */
    dirichlet_mackay_peto(std::istream& in);

    void save(std::ostream& out) const override;

  private:
    /**
     * Placeholder: performs no estimation. It resets mu_ to 0 and returns
     * an empty map, ignoring all of its arguments.
     *
     * NOTE(review): because mu_ is overwritten with 0, any mu supplied at
     * construction is lost once this runs -- confirm that is acceptable
     * until the real method is implemented.
     *
     * @param dd unused
     * @param eps unused
     * @param max_iter unused
     * @return an empty map
     */
    std::map<term_id, double> optimize_mu(docs_data& dd, float eps,
                                          int max_iter) override
    {
        // mark the parameters as intentionally unused (instead of
        // assigning them to themselves to silence the warning)
        static_cast<void>(dd);
        static_cast<void>(eps);
        static_cast<void>(max_iter);

        mu_ = 0;

        return std::map<term_id, double>{};
    }
};

/// Factory-method specialization that creates dirichlet_prior rankers.
template <>
std::unique_ptr<ranker>
make_ranker<dirichlet_prior>(const cpptoml::table& config);

/// Factory-method specialization that creates dirichlet_digamma_rec rankers.
template <>
std::unique_ptr<ranker>
make_ranker<dirichlet_digamma_rec>(const cpptoml::table& config);

/// Factory-method specialization that creates dirichlet_log_approx rankers.
template <>
std::unique_ptr<ranker>
make_ranker<dirichlet_log_approx>(const cpptoml::table& config);

/// Factory-method specialization that creates dirichlet_mackay_peto rankers.
template <>
std::unique_ptr<ranker>
make_ranker<dirichlet_mackay_peto>(const cpptoml::table& config);
}
}
#endif
1 change: 1 addition & 0 deletions src/index/ranker/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@ add_subdirectory(test_opt)

add_library(meta-ranker absolute_discount.cpp
dirichlet_prior.cpp
dirichlet_prior_opt.cpp
jelinek_mercer.cpp
lm_ranker.cpp
okapi_bm25.cpp
Loading