Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hyperparameters estimation for LDA #193

Open
wants to merge 36 commits into
base: develop
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
1aa7032
+ added optimization.h and .gitignore updated
alex2304 Nov 18, 2017
5e3e23c
[opt] dirichlet_optimizer class, digamma function
alex2304 Nov 19, 2017
b8dbc7d
[opt] minka_fpi method draft
alex2304 Nov 19, 2017
3dc03a8
[opt] optimization.h errors fixed, test without MeTa
alex2304 Nov 20, 2017
eeb9168
[opt] debug output
alex2304 Nov 20, 2017
766754f
Adding optimization.cpp
MakKolts Nov 20, 2017
98c3e7d
Merge branch 'develop' of https://github.com/alex2304/meta into develop
Nov 20, 2017
e9c99df
[opt] classes for methods in dirichlet_prior
alex2304 Nov 29, 2017
b11e704
Merge branch 'develop' of https://github.com/alex2304/meta into develop
Nov 29, 2017
54d7272
Deletion of previous stuff
MakKolts Nov 29, 2017
6585189
Test for dirichlet optimizations
MakKolts Nov 29, 2017
c0a357c
Private/public methods
MakKolts Nov 29, 2017
76d32ae
[opt] test indexes
alex2304 Nov 29, 2017
4ccda58
Interface for methods
MakKolts Nov 29, 2017
248c151
Refactoring of optimization interface
MakKolts Nov 29, 2017
61ece78
[opt] tmp for merge
alex2304 Nov 29, 2017
f979264
Tests for all functions at same time
MakKolts Nov 29, 2017
ba00c86
[opt] + term_ids()
alex2304 Nov 29, 2017
1f13f95
[opt] merged dirichlet_prior
alex2304 Nov 29, 2017
ed475b5
[opt] + first method without testing
alex2304 Nov 30, 2017
4528ec6
[opt] *first method builds
alex2304 Nov 30, 2017
312a485
[opt] * method works
alex2304 Nov 30, 2017
b60cc54
[opt] *first method debugged
alex2304 Nov 30, 2017
0a0851c
[opt] method refactored
alex2304 Nov 30, 2017
d726f70
[opt] + method2
alex2304 Nov 30, 2017
4a6a240
Adding constructors and register for new ranker classes
MakKolts Nov 30, 2017
f55e0de
Merge branch 'develop' of https://github.com/alex2304/meta into develop
MakKolts Nov 30, 2017
bc948ce
Add rankers to factory
MakKolts Nov 30, 2017
78d6d5c
[opt] + benchmark
alex2304 Nov 30, 2017
25d89d1
Merge branch 'develop' of https://github.com/alex2304/meta into develop
alex2304 Nov 30, 2017
5bc6ee6
Minor fix for output
MakKolts Nov 30, 2017
4f8fa1d
[opt] + dirichlet_opt files
alex2304 Nov 30, 2017
c8ddfbf
[opt] + dirichlet_prior_opt
alex2304 Nov 30, 2017
f7b634a
[opt] + MacKay and Peto method
alex2304 Dec 1, 2017
d4b0a8d
[opt] + comments and docs
alex2304 Dec 3, 2017
001fac6
[opt] - test files
alex2304 Dec 4, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
[opt] + first method without testing
alex2304 committed Nov 30, 2017
commit ed475b5339e7dd8dd4012a55f19e9948aaf26a1c
135 changes: 124 additions & 11 deletions include/meta/index/ranker/dirichlet_prior.h
Original file line number Diff line number Diff line change
@@ -74,6 +74,32 @@ class dirichlet_prior : public language_model_ranker
float mu_;
};

struct docs_data
{
// general info

inverted_index& idx;
/// ids of all documents
std::vector<doc_id> doc_ids;
/// ids of all terms
std::vector<term_id> term_ids;

/**
* Constructor to initialize most elements.
* @param p_idx The index that is being used
* @param p_doc_ids ids of all docs
* @param p_term_ids ids of all terms
*/
score_data(inverted_index& p_idx, std::vector<doc_id> p_doc_ids, std::vector<doc_id> p_term_ids,
uint64_t p_total_terms, float p_query_length)
: idx(p_idx), // gcc no non-const ref init from brace init list
doc_ids{p_doc_ids},
term_ids{p_term_ids}
{
/* nothing */
}
};

class dirichlet_prior_opt : public dirichlet_prior{
public:
template <class ForwardIterator>
@@ -88,34 +114,121 @@ class dirichlet_prior_opt : public dirichlet_prior{
}

float get_optimized_mu(const inverted_index& idx) {
optimize(idx);
optimize_mu(idx);

return mu_;
}

private:
void optimize(const inverted_index& idx) {
// TODO: parse idx
void optimize_mu(const inverted_index& idx) {
auto docs_ids = idx.docs();
auto terms_ids = idx.terms();
docs_data dd{idx, docs_ids, terms_ids};

std::cout << idx.unique_terms() << std::endl;
optimize_mu(dd);
// std::cout << idx.unique_terms() << std::endl;

for (auto d_id: docs_ids){
for (auto t_id: terms_ids){
std::cout << idx.term_freq(t_id, d_id) << std::endl;
}
}
// for (auto d_id: docs_ids){
// for (auto t_id: terms_ids){
// std::cout << idx.term_freq(t_id, d_id) << std::endl;
// }
// }

// optimize_mu(std::vector<doc_id> docs_ids,
// idx.unique_terms()
// idx.total_corpus_terms()

}

virtual void optimize_mu(const inverted_index& idx) = 0;
virtual void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) = 0;
};

// TODO: choose template type instead of long
/// Integer type used for sizes and frequencies in the count histograms.
using count_d = long;

class digamma_rec: public dirichlet_prior_opt{
void optimize_mu(const inverted_index& idx) override { mu_ = 0;};
void optimize_mu(const docs_data& dd, float eps=1e-6, int max_iter=100) override {
// fill C_.(n) and C_k(n)

std::map<count_d, count_d> docs_counts;
std::map<term_id, std::map<count_d, count_d>> terms_docs_counts;
long doc_size, doc_term_freq;

for (auto d_id: dd.doc_ids){
doc_size = dd.idx.doc_size(d_id);

//// increase number of docs with the given size (C_.(n))
docs_counts[doc_size] += 1;

for (auto t_id: dd.idx.terms(d_id)){
doc_term_freq = dd.idx.term_freq(t_id, d_id);

//// increase number of docs with the given count of word t_id (C_k(n))
terms_docs_counts[t_id][doc_term_freq] += 1;
}
}

// // sort by ascending of occurences
// std::sort(docs_counts.begin(), items.end());
// for (auto key: terms_docs_counts){
// std::sort(key.second.begin(), key.second.end());
// }

// p(w|REF) = dd.idx.total_num_occurences(t_id)

// fill start vector alpha_m
double alpha = 1;
std::map<term_id, double> alpha_m;

for (auto t_id: dd.idx.terms()){
alpha_m[t_id] = dd.idx.total_num_occurences(t_id) * alpha;
}

double D, S;
bool converged = false;

while (!converged){
D = 0;
S = 0;

alpha = 0;
for (auto alpha_m_k: alpha_m){
alpha += alpha_m_k;
}

count_d n, c_d;
for (auto kv: docs_counts){
n = kv.first;
c_d = kv.second;

D += 1/(n - 1 + alpha);
S += c_d * D;
}

std::map<count_d, count_d> c_n;
term_id k;
double S_k;
for (auto kv: terms_docs_counts){
k = kv.first;
c_n = kv.second;

D = 0;
S_k = 0;

count_d n, c_k_n;
for (auto kv_: c_n){
n = kv_.first;
c_k_n = kv_.second;

D += 1/(n - 1 + alpha * m_k);
S_k += c_k_n * D;
}

alpha_m[k] *= S_k / S;
}
}

}
};

class log_approx: public dirichlet_prior_opt{