From eac7a4c2545c30372d1040031bccec2582bb128f Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 26 Aug 2014 19:09:33 -0500 Subject: [PATCH 001/481] refactor language_model functions to use analysis object (adding log likelihood) --- include/lm/language_model.h | 21 +++++++++++++++++++-- lm-test.cpp | 20 ++++++++++++-------- src/lm/language_model.cpp | 34 ++++++++++++++++++++++++++-------- 3 files changed, 57 insertions(+), 18 deletions(-) diff --git a/include/lm/language_model.h b/include/lm/language_model.h index 04e15f268..dd197dac1 100644 --- a/include/lm/language_model.h +++ b/include/lm/language_model.h @@ -13,6 +13,7 @@ #include #include #include +#include namespace meta { @@ -21,6 +22,10 @@ namespace lm class language_model { public: + /// An analysis consists of each n-gram from a sequence and its probability + // under the current model. + using lm_analysis = std::vector, double>>; + /** * Creates an N-gram language model based on the corpus specified in the * config file. @@ -63,6 +68,13 @@ class language_model */ double perplexity_per_word(const std::string& tokens) const; + /** + * @param tokens A sequence of tokens + * @return the log likelihood of the sequence of tokens under this + * language model + */ + double log_likelihood(const std::string& tokens) const; + /** * @param tokens A sequence of n tokens * @return the probability of seeing the nth token based on the previous n @@ -70,8 +82,13 @@ class language_model */ double prob(std::deque tokens) const; - private: + /** + * @param tokens A sequence of tokens + * @return statistical information about each n-gram of the sequence + */ + lm_analysis analysis(const std::string& tokens) const; + private: /** * Builds the probabilities associated with this language model. * @param config_file The config file that specifies the location of the @@ -102,7 +119,7 @@ class language_model size_t N_; /// The interpolation coefficient for smoothing LM probabilities - constexpr static double lambda_ = 0.7; + constexpr static double lambda_ = 0.85; }; } } diff --git a/lm-test.cpp b/lm-test.cpp index 16620a1c8..b2422f2f0 100644 --- a/lm-test.cpp +++ b/lm-test.cpp @@ -9,16 +9,18 @@ using namespace meta; +template +std::string make_string(const C& cont) +{ + std::string ret = ""; + for(auto& e: cont) + ret += e + " "; + return ret; +} + int main(int argc, char* argv[]) { lm::language_model model{argv[1], 3}; - for (size_t i = 1; i < 10; ++i) - { - auto sentence = model.generate(i); - std::cout << sentence << std::endl; - std::cout << " -> perplexity_per_word: " - << model.perplexity_per_word(sentence) << std::endl; - } std::cout << "Input a sentence to score (blank to quit):" << std::endl; std::string line; @@ -26,6 +28,8 @@ int main(int argc, char* argv[]) { std::cout << "> "; std::getline(std::cin, line); - std::cout << model.perplexity_per_word(line) << std::endl; + auto lma = model.analysis(line); + for(auto& p: lma) + std::cout << make_string(p.first) << ": " << p.second << std::endl; } } diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index 0b567c6d9..30df00b15 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -30,8 +30,9 @@ language_model::language_model(const std::string& config_file) auto config = cpptoml::parse_file(config_file); auto group = config.get_group("language-model"); auto nval = group->get_as("n-value"); - if(!nval) - throw std::runtime_error{"no n-value specified in language-model group"}; + if (!nval) + throw std::runtime_error{ + "no n-value specified in language-model 
group"}; N_ = *nval; @@ -41,8 +42,7 @@ language_model::language_model(const std::string& config_file) learn_model(config_file); } -language_model::language_model(const std::string& config_file, size_t n): - N_{n} +language_model::language_model(const std::string& config_file, size_t n) : N_{n} { if (N_ > 1) interp_ = make_unique(config_file, N_ - 1); @@ -169,23 +169,41 @@ double language_model::prob(std::deque tokens) const return lambda_ * prob->second + (1.0 - lambda_) * interp_prob; } -double language_model::perplexity(const std::string& tokens) const +auto language_model::analysis(const std::string& tokens) const -> lm_analysis { std::deque ngram; for (size_t i = 1; i < N_; ++i) ngram.push_back(""); - double perp = 0.0; - for (auto& token : make_deque(tokens)) + lm_analysis result; + auto dq = make_deque(tokens); + dq.push_back(""); + for (auto& token : dq) { ngram.push_back(token); - perp += std::log(1.0 + 1.0 / prob(ngram)); + result.emplace_back(ngram, prob(ngram)); ngram.pop_front(); } + return result; +} + +double language_model::perplexity(const std::string& tokens) const +{ + double perp = 0.0; + for (auto& p : analysis(tokens)) + perp += std::log(1.0 + 1.0 / p.second); return std::pow(perp, 1.0 / N_); } +double language_model::log_likelihood(const std::string& tokens) const +{ + double like = 0.0; + for (auto& p : analysis(tokens)) + like += std::log(1.0 + p.second); + return like; +} + double language_model::perplexity_per_word(const std::string& tokens) const { return perplexity(tokens) / tokens.size(); From 35c58909e046f3d047df61be52d38808e6426bcb Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 26 Aug 2014 20:40:19 -0500 Subject: [PATCH 002/481] add function to get top k most likely next words --- include/lm/language_model.h | 8 ++++++++ src/lm/language_model.cpp | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/include/lm/language_model.h b/include/lm/language_model.h index dd197dac1..432303db5 100644 --- a/include/lm/language_model.h +++ b/include/lm/language_model.h @@ -53,6 +53,14 @@ class language_model std::string next_token(const std::deque& tokens, double random) const; + /** + * @param prev A sequence of n - 1 tokens preceding the desired token + * @param k + * @return a list of up to k most likely tokens to come next + */ + std::vector> + top_k(const std::deque& prev, size_t k) const; + /** * @param tokens A sequence of tokens * @return the perplexity of this token sequence given the current language diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index 30df00b15..cf1f31ecd 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -118,6 +118,34 @@ std::string language_model::next_token(const std::deque& tokens, throw std::runtime_error{"could not generate next token: " + str}; } +std::vector> + language_model::top_k(const std::deque& prev, size_t k) const +{ + if (prev.size() != N_ - 1) + throw std::runtime_error{"prev should contain n - 1 tokens"}; + + auto it = dist_.find(make_string(prev)); + if (it == dist_.end()) + throw std::runtime_error{"no transitions found"}; + + using pair_t = std::pair; + std::vector probs{it->second.begin(), it->second.end()}; + + auto comp = [&](const pair_t& a, const pair_t& b) + { return a.second > b.second; }; + if (k >= probs.size()) + { + std::sort(probs.begin(), probs.end(), comp); + return probs; + } + + std::nth_element(probs.begin(), probs.begin() + k, probs.end(), comp); + std::vector sorted{probs.begin(), probs.begin() + k}; + 
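// [illustrative note, not part of the patch] The surrounding lines use
// the standard top-k selection idiom: std::nth_element moves the k best
// pairs to the front in O(n) average time, and only those k get fully
// sorted, for O(n + k log k) total rather than the O(n log n) of sorting
// everything. A minimal self-contained sketch of the same technique,
// with hypothetical data (needs <algorithm>, <string>, <utility>,
// <vector>):
//
//   using pair_t = std::pair<std::string, double>;
//   std::vector<pair_t> v = {{"dog", 0.2}, {"cat", 0.5}, {"eel", 0.3}};
//   std::size_t k = 2;
//   auto comp = [](const pair_t& a, const pair_t& b)
//   { return a.second > b.second; };
//   std::nth_element(v.begin(), v.begin() + k, v.end(), comp);
//   std::sort(v.begin(), v.begin() + k, comp); // {cat, eel} now lead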
std::sort(sorted.begin(), sorted.end(), comp); + + return sorted; +} + std::string language_model::generate(unsigned int seed) const { std::default_random_engine gen(seed); From 631db209e93d3229436ea55e98cc5a940a20118f Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 9 Sep 2014 17:00:10 -0500 Subject: [PATCH 003/481] work on lm-test exe --- src/lm/tools/lm-test.cpp | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/lm/tools/lm-test.cpp b/src/lm/tools/lm-test.cpp index b2422f2f0..d3439626b 100644 --- a/src/lm/tools/lm-test.cpp +++ b/src/lm/tools/lm-test.cpp @@ -4,6 +4,7 @@ */ #include +#include #include "meta.h" #include "lm/language_model.h" @@ -13,7 +14,7 @@ template std::string make_string(const C& cont) { std::string ret = ""; - for(auto& e: cont) + for (auto& e : cont) ret += e + " "; return ret; } @@ -29,7 +30,30 @@ int main(int argc, char* argv[]) std::cout << "> "; std::getline(std::cin, line); auto lma = model.analysis(line); - for(auto& p: lma) + for (auto& p : lma) std::cout << make_string(p.first) << ": " << p.second << std::endl; + + std::cout << std::endl; + + using pair_t = decltype(lma[0]); + auto it = std::min_element(lma.begin(), lma.end(), + [&](const pair_t& a, const pair_t& b) { + return a.second < b.second; + } + ); + + auto bad_one = it->first; + std::cout << "You might want to modify/remove the word \"" + << bad_one.back() << "\"" << std::endl; + + bad_one.pop_back(); + std::cout << "Candidates:" << std::endl; + size_t i = 1; + for(auto& p: model.top_k(bad_one, 5)) + { + + std::cout << " " << i << ". " << p.first << std::endl; + ++i; + } } } From 8612a8293f51460f149da0b4b740c365f9d5dc68 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 29 Sep 2014 14:53:34 -0500 Subject: [PATCH 004/481] remove analysis component and fix errors --- include/lm/language_model.h | 45 ++++++-------- src/lm/language_model.cpp | 115 +++++++++++++++++++++++++----------- src/lm/tools/lm-test.cpp | 47 ++------------- 3 files changed, 106 insertions(+), 101 deletions(-) diff --git a/include/lm/language_model.h b/include/lm/language_model.h index b7189e445..5d7268ab1 100644 --- a/include/lm/language_model.h +++ b/include/lm/language_model.h @@ -11,10 +11,10 @@ #define META_LANGUAGE_MODEL_H_ #include +#include #include #include #include -#include namespace meta { @@ -23,10 +23,6 @@ namespace lm class language_model { public: - /// An analysis consists of each n-gram from a sequence and its probability - // under the current model. - using lm_analysis = std::vector, double>>; - /** * Creates an N-gram language model based on the corpus specified in the * config file. 
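// [illustrative note, not part of the patch] From this patch onward the
// constructor reads everything it needs from the [language-model] table
// of the config file; the keys consumed by select_method in the .cpp
// diff below are "n-value", "format", and, for precomputed counts,
// "prefix". A hypothetical block exercising both paths:
//
//   [language-model]
//   n-value = 3
//   format = "precomputed"   # or "learn" to count from the corpus
//   prefix = "/path/to/counts/"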
@@ -54,14 +50,6 @@ class language_model std::string next_token(const std::deque& tokens, double random) const; - /** - * @param prev A sequence of n - 1 tokens preceding the desired token - * @param k - * @return a list of up to k most likely tokens to come next - */ - std::vector> - top_k(const std::deque& prev, size_t k) const; - /** * @param tokens A sequence of tokens * @return the perplexity of this token sequence given the current language @@ -77,13 +65,6 @@ class language_model */ double perplexity_per_word(const std::string& tokens) const; - /** - * @param tokens A sequence of tokens - * @return the log likelihood of the sequence of tokens under this - * language model - */ - double log_likelihood(const std::string& tokens) const; - /** * @param tokens A sequence of n tokens * @return the probability of seeing the nth token based on the previous n @@ -92,10 +73,12 @@ class language_model double prob(std::deque tokens) const; /** - * @param tokens A sequence of tokens - * @return statistical information about each n-gram of the sequence + * @param prev Seen tokens to base the next token off of + * @param k Number of results to return + * @return a sorted vector of likely next tokens */ - lm_analysis analysis(const std::string& tokens) const; + std::vector> + top_k(const std::deque& prev, size_t k) const; private: /** @@ -105,6 +88,16 @@ class language_model */ void learn_model(const std::string& config_file); + /** + * @param config_file + */ + void select_method(const std::string& config_file); + + /** + * @param prefix Path to where the counts files are stored + */ + void read_precomputed(const std::string& prefix); + /** * @param tokens A deque of tokens to convert to a string * @return the string version of the deque (space delimited) @@ -121,14 +114,14 @@ class language_model std::unique_ptr interp_; /// Contains the N-gram distribution probabilities (N-1 words -> (w, prob)) - std::unordered_map - > dist_; + std::unordered_map> + dist_; /// The value of N in this n-gram size_t N_; /// The interpolation coefficient for smoothing LM probabilities - constexpr static double lambda_ = 0.85; + constexpr static double lambda_ = 0.7; }; } } diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index cf1f31ecd..c35884c48 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -29,7 +29,7 @@ language_model::language_model(const std::string& config_file) { auto config = cpptoml::parse_file(config_file); auto group = config.get_group("language-model"); - auto nval = group->get_as("n-value"); + auto nval = group->get_as("n-value"); if (!nval) throw std::runtime_error{ "no n-value specified in language-model group"}; @@ -39,21 +39,45 @@ language_model::language_model(const std::string& config_file) if (N_ > 1) interp_ = make_unique(config_file, N_ - 1); - learn_model(config_file); + select_method(config_file); } -language_model::language_model(const std::string& config_file, size_t n) : N_{n} +void language_model::select_method(const std::string& config_file) +{ + std::cout << "Creating " << N_ << "-gram language model" << std::endl; + + auto config = cpptoml::parse_file(config_file); + auto group = config.get_group("language-model"); + auto format = group->get_as("format"); + if (!format) + throw std::runtime_error{"no format specified in language-model group"}; + + if (*format == "precomputed") + { + auto prefix = group->get_as("prefix"); + if (!prefix) + throw std::runtime_error{ + "no prefix specified for precomputed language model"}; + 
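// [illustrative note, not part of the patch] read_precomputed (defined
// further down) opens "<prefix><N>-grams.txt" and expects one n-gram
// per line, count first, followed by the N tokens; a hypothetical
// 3-grams.txt fragment:
//
//   1842 the quick fox
//   977 a lazy dog
//
// After loading, each context's counts are normalized in place into a
// probability distribution.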
read_precomputed(*prefix); + } + else if (*format == "learn") + learn_model(config_file); + else + throw std::runtime_error{ + "language-model format could not be determined"}; +} + +language_model::language_model(const std::string& config_file, size_t n): + N_{n} { if (N_ > 1) interp_ = make_unique(config_file, N_ - 1); - learn_model(config_file); + select_method(config_file); } void language_model::learn_model(const std::string& config_file) { - std::cout << "Creating " << N_ << "-gram language model" << std::endl; - auto corpus = corpus::corpus::load(config_file); using namespace analyzers; @@ -99,6 +123,47 @@ void language_model::learn_model(const std::string& config_file) } } +void language_model::read_precomputed(const std::string& prefix) +{ + std::ifstream in{prefix + std::to_string(N_) + "-grams.txt"}; + std::string line; + uint64_t count; + while (in) + { + std::getline(in, line); + std::istringstream iss{line}; + iss >> count; + std::deque ngram; + std::string token; + for (size_t i = 0; i < N_ - 1; ++i) + { + iss >> token; + ngram.push_back(token); + } + + // if there is one remaining token to read + if (iss) + { + iss >> token; + dist_[make_string(ngram)][token] = count; + } + else // if unigram + { + dist_[""][make_string(ngram)] = count; + } + } + + // turn counts into probabilities + for (auto& map : dist_) + { + double sum = 0.0; + for (auto& end : map.second) + sum += end.second; + for (auto& end : map.second) + end.second /= sum; + } +} + std::string language_model::next_token(const std::deque& tokens, double random) const { @@ -132,7 +197,9 @@ std::vector> std::vector probs{it->second.begin(), it->second.end()}; auto comp = [&](const pair_t& a, const pair_t& b) - { return a.second > b.second; }; + { + return a.second > b.second; + }; if (k >= probs.size()) { std::sort(probs.begin(), probs.end(), comp); @@ -197,48 +264,30 @@ double language_model::prob(std::deque tokens) const return lambda_ * prob->second + (1.0 - lambda_) * interp_prob; } -auto language_model::analysis(const std::string& tokens) const -> lm_analysis +double language_model::perplexity(const std::string& tokens) const { std::deque ngram; for (size_t i = 1; i < N_; ++i) ngram.push_back(""); - lm_analysis result; - auto dq = make_deque(tokens); - dq.push_back(""); - for (auto& token : dq) + double perp = 0.0; + for (auto& token : make_deque(tokens)) { ngram.push_back(token); - result.emplace_back(ngram, prob(ngram)); + perp += std::log(1.0 + 1.0 / prob(ngram)); ngram.pop_front(); } - return result; -} - -double language_model::perplexity(const std::string& tokens) const -{ - double perp = 0.0; - for (auto& p : analysis(tokens)) - perp += std::log(1.0 + 1.0 / p.second); return std::pow(perp, 1.0 / N_); } -double language_model::log_likelihood(const std::string& tokens) const -{ - double like = 0.0; - for (auto& p : analysis(tokens)) - like += std::log(1.0 + p.second); - return like; -} - double language_model::perplexity_per_word(const std::string& tokens) const { return perplexity(tokens) / tokens.size(); } -std::deque language_model::make_deque(const std::string - & tokens) const +std::deque + language_model::make_deque(const std::string& tokens) const { std::deque d; std::stringstream sstream{tokens}; @@ -249,8 +298,8 @@ std::deque language_model::make_deque(const std::string return d; } -std::string language_model::make_string(const std::deque - & tokens) const +std::string + language_model::make_string(const std::deque& tokens) const { std::string result{""}; if (tokens.empty()) diff --git 
a/src/lm/tools/lm-test.cpp b/src/lm/tools/lm-test.cpp index d3439626b..6cad71eb4 100644 --- a/src/lm/tools/lm-test.cpp +++ b/src/lm/tools/lm-test.cpp @@ -4,56 +4,19 @@ */ #include -#include #include "meta.h" #include "lm/language_model.h" using namespace meta; -template -std::string make_string(const C& cont) -{ - std::string ret = ""; - for (auto& e : cont) - ret += e + " "; - return ret; -} - int main(int argc, char* argv[]) { lm::language_model model{argv[1], 3}; - - std::cout << "Input a sentence to score (blank to quit):" << std::endl; - std::string line; - while (true) + for (size_t i = 1; i < 10; ++i) { - std::cout << "> "; - std::getline(std::cin, line); - auto lma = model.analysis(line); - for (auto& p : lma) - std::cout << make_string(p.first) << ": " << p.second << std::endl; - - std::cout << std::endl; - - using pair_t = decltype(lma[0]); - auto it = std::min_element(lma.begin(), lma.end(), - [&](const pair_t& a, const pair_t& b) { - return a.second < b.second; - } - ); - - auto bad_one = it->first; - std::cout << "You might want to modify/remove the word \"" - << bad_one.back() << "\"" << std::endl; - - bad_one.pop_back(); - std::cout << "Candidates:" << std::endl; - size_t i = 1; - for(auto& p: model.top_k(bad_one, 5)) - { - - std::cout << " " << i << ". " << p.first << std::endl; - ++i; - } + auto sentence = model.generate(i); + std::cout << sentence << std::endl; + std::cout << " -> perplexity_per_word: " + << model.perplexity_per_word(sentence) << std::endl; } } From 2a904ce947e4e4439a9d60416a6966f41477c109 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 29 Sep 2014 19:42:33 -0500 Subject: [PATCH 005/481] replace std::deque + std::string with lm::sentence --- include/lm/language_model.h | 31 +++-------- include/lm/sentence.h | 108 ++++++++++++++++++++++++++++++++++++ src/lm/CMakeLists.txt | 3 +- src/lm/language_model.cpp | 77 +++++++++---------------- src/lm/sentence.cpp | 88 +++++++++++++++++++++++++++++ src/lm/tools/lm-test.cpp | 30 ++++++++-- 6 files changed, 257 insertions(+), 80 deletions(-) create mode 100644 include/lm/sentence.h create mode 100644 src/lm/sentence.cpp diff --git a/include/lm/language_model.h b/include/lm/language_model.h index 5d7268ab1..c2d969de2 100644 --- a/include/lm/language_model.h +++ b/include/lm/language_model.h @@ -10,11 +10,11 @@ #ifndef META_LANGUAGE_MODEL_H_ #define META_LANGUAGE_MODEL_H_ -#include #include #include #include #include +#include "lm/sentence.h" namespace meta { @@ -43,34 +43,33 @@ class language_model std::string generate(unsigned int seed) const; /** - * @param tokens The previous N - 1 tokens + * @param sentence The previous N - 1 tokens * @param random A random number on [0, 1] used for choosing the next token * @return the next token based on the previous tokens */ - std::string next_token(const std::deque& tokens, - double random) const; + std::string next_token(const sentence& sen, double random) const; /** - * @param tokens A sequence of tokens + * @param sentence A sequence of tokens * @return the perplexity of this token sequence given the current language * model: \f$ \sqrt[n]{\prod_{i=1}^n\frac{1}{p(w_i|w_{i-n}\cdots w_{i-1})}} * \f$ */ - double perplexity(const std::string& tokens) const; + double perplexity(const sentence& tokens) const; /** - * @param tokens A sequence of tokens + * @param sentence A sequence of tokens * @return the perplexity of this token sequence given the current language * model normalized by the length of the sequence */ - double perplexity_per_word(const std::string& tokens) 
const; + double perplexity_per_word(const sentence& tokens) const; /** * @param tokens A sequence of n tokens * @return the probability of seeing the nth token based on the previous n * - 1 tokens */ - double prob(std::deque tokens) const; + double prob(sentence tokens) const; /** * @param prev Seen tokens to base the next token off of @@ -78,7 +77,7 @@ class language_model * @return a sorted vector of likely next tokens */ std::vector> - top_k(const std::deque& prev, size_t k) const; + top_k(const sentence& prev, size_t k) const; private: /** @@ -98,18 +97,6 @@ class language_model */ void read_precomputed(const std::string& prefix); - /** - * @param tokens A deque of tokens to convert to a string - * @return the string version of the deque (space delimited) - */ - std::string make_string(const std::deque& tokens) const; - - /** - * @param tokens A string of space-delimited tokens to convert to a deque - * @return a deque of the tokens - */ - std::deque make_deque(const std::string& tokens) const; - /// The language_model used to interpolate with this one for smoothing std::unique_ptr interp_; diff --git a/include/lm/sentence.h b/include/lm/sentence.h new file mode 100644 index 000000000..32ed89eb9 --- /dev/null +++ b/include/lm/sentence.h @@ -0,0 +1,108 @@ +/** + * @file sentence.h + * @author Sean Massung + */ + +#ifndef META_SENTENCE_H_ +#define META_SENTENCE_H_ + +#include +#include + +namespace meta +{ +namespace lm +{ +class sentence +{ + public: + using iterator = std::deque::iterator; + using const_iterator = std::deque::const_iterator; + using size_type = std::deque::size_type; + + /** + * Default constructor; an empty sentence. + */ + sentence() = default; + + /** + * Creates a sentence based on a text string, parsed with the default filter + * chain. 
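     * [illustrative note, not part of the patch] A usage sketch,
     * assuming the default filter chain (tokenize, lowercase, alpha)
     * set up in sentence.cpp below:
     *
     *   lm::sentence s{"The quick brown fox."};
     *   s.substitute(1, "slow"); // the slow brown fox
     *   s.remove(3);             // the slow brown
     *   s.insert(0, "behold");   // behold the slow brown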
+ * @param text + */ + sentence(const std::string& text); + + /** + * @return a string representation of this sentence + */ + std::string to_string() const; + + /** + * @param idx + * @return the token at the specified index + */ + const std::string& operator[](size_type idx) const; + + /** + * @param idx + * @param token + * @return replace the token at the specified index with the provided token + */ + void substitute(size_type idx, const std::string& token); + + /** + * @param idx Index of the token to remove from this sentence + */ + void remove(size_type idx); + + /** + * @param idx Index to insert a token in front of (to insert at beginning, + * idx = 0) + * @param token + */ + void insert(size_type idx, const std::string& token); + + std::string front() const; + + std::string back() const; + + void push_front(const std::string& token); + + void pop_front(); + + void push_back(const std::string& token); + + void pop_back(); + + /** + * @return an iterator to the beginning of the sequence + */ + iterator begin(); + + /** + * @return an iterator to the end of the sequence + */ + iterator end(); + + /** + * @return a const_iterator to the beginning of the sequence + */ + const_iterator begin() const; + + /** + * @return a const_iterator to the end of the sequence + */ + const_iterator end() const; + + /** + * @return the number of observations in the sequence + */ + size_type size() const; + + private: + std::deque tokens_; +}; +} +} + +#endif diff --git a/src/lm/CMakeLists.txt b/src/lm/CMakeLists.txt index c512aac26..7a943c4b2 100644 --- a/src/lm/CMakeLists.txt +++ b/src/lm/CMakeLists.txt @@ -2,5 +2,6 @@ project(meta-language-model) add_subdirectory(tools) -add_library(meta-language-model language_model.cpp) +add_library(meta-language-model language_model.cpp + sentence.cpp) target_link_libraries(meta-language-model meta-corpus) diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index c35884c48..bef5cbdda 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -67,8 +67,7 @@ void language_model::select_method(const std::string& config_file) "language-model format could not be determined"}; } -language_model::language_model(const std::string& config_file, size_t n): - N_{n} +language_model::language_model(const std::string& config_file, size_t n) : N_{n} { if (N_ > 1) interp_ = make_unique(config_file, N_ - 1); @@ -93,7 +92,7 @@ void language_model::learn_model(const std::string& config_file) stream->set_content(doc.content()); // get ngram stream started - std::deque ngram; + sentence ngram; for (size_t i = 1; i < N_; ++i) ngram.push_back(""); @@ -103,7 +102,7 @@ void language_model::learn_model(const std::string& config_file) auto token = stream->next(); if (N_ > 1) { - ++dist_[make_string(ngram)][token]; + ++dist_[ngram.to_string()][token]; ngram.pop_front(); ngram.push_back(token); } @@ -133,7 +132,7 @@ void language_model::read_precomputed(const std::string& prefix) std::getline(in, line); std::istringstream iss{line}; iss >> count; - std::deque ngram; + sentence ngram; std::string token; for (size_t i = 0; i < N_ - 1; ++i) { @@ -145,11 +144,11 @@ void language_model::read_precomputed(const std::string& prefix) if (iss) { iss >> token; - dist_[make_string(ngram)][token] = count; + dist_[ngram.to_string()][token] = count; } else // if unigram { - dist_[""][make_string(ngram)] = count; + dist_[""][ngram.to_string()] = count; } } @@ -164,13 +163,13 @@ void language_model::read_precomputed(const std::string& prefix) } } -std::string 
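// [illustrative note, not part of the patch] learn_model above uses the
// classic sliding-window idiom for n-gram counting: seed the window with
// N - 1 sentence-start markers, then for each token count the
// (context -> token) transition and slide the window by one. The core
// loop in isolation, with a hypothetical count table and join() helper
// that space-delimits the context (needs <deque>, <string>,
// <unordered_map>):
//
//   std::deque<std::string> window(N - 1, "<s>");
//   for (const auto& tok : tokens)
//   {
//       ++counts[join(window)][tok];
//       window.pop_front();
//       window.push_back(tok);
//   }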
language_model::next_token(const std::deque& tokens, +std::string language_model::next_token(const sentence& tokens, double random) const { - auto str = make_string(tokens); - auto it = dist_.find(str); + auto it = dist_.find(tokens.to_string()); if (it == dist_.end()) - throw std::runtime_error{"couldn't find previous n - 1 tokens: " + str}; + throw std::runtime_error{"couldn't find previous n - 1 tokens: " + + tokens.to_string()}; double cur = 0.0; for (auto& end : it->second) @@ -180,16 +179,17 @@ std::string language_model::next_token(const std::deque& tokens, return end.first; } - throw std::runtime_error{"could not generate next token: " + str}; + throw std::runtime_error{"could not generate next token: " + + tokens.to_string()}; } std::vector> - language_model::top_k(const std::deque& prev, size_t k) const + language_model::top_k(const sentence& prev, size_t k) const { if (prev.size() != N_ - 1) throw std::runtime_error{"prev should contain n - 1 tokens"}; - auto it = dist_.find(make_string(prev)); + auto it = dist_.find(prev.to_string()); if (it == dist_.end()) throw std::runtime_error{"no transitions found"}; @@ -219,7 +219,7 @@ std::string language_model::generate(unsigned int seed) const std::uniform_real_distribution rdist(0.0, 1.0); // start generating at the beginning of a sequence - std::deque ngram; + sentence ngram; for (size_t n = 1; n < N_; ++n) ngram.push_back(""); @@ -235,25 +235,23 @@ std::string language_model::generate(unsigned int seed) const next = next_token(ngram, rdist(gen)); } - output += make_string(ngram); + output += ngram.to_string(); return output; } -double language_model::prob(std::deque tokens) const +double language_model::prob(sentence tokens) const { if (tokens.size() != N_) throw std::runtime_error{"prob() needs one N-gram"}; - std::deque interp_tokens{tokens}; + sentence interp_tokens{tokens}; interp_tokens.pop_front(); // look at prev N - 1 auto interp_prob = interp_ ? 
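// [illustrative note, not part of the patch] prob() implements
// Jelinek-Mercer interpolation with a recursively built lower-order
// model:
//
//   P(w | w_1 .. w_{N-1}) = lambda * P_ML(w | full context)
//                         + (1 - lambda) * P_interp(w | shorter context)
//
// where lambda is the lambda_ member (0.7 at this point in the series).
// A context or word never seen in training contributes only the
// (1 - lambda) fallback term, which is what the two early returns just
// below encode.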
interp_->prob(interp_tokens) : 1.0; auto last = tokens.back(); tokens.pop_back(); - auto ngram = make_string(tokens); - - auto endings = dist_.find(ngram); + auto endings = dist_.find(tokens.to_string()); if (endings == dist_.end()) return (1.0 - lambda_) * interp_prob; @@ -264,51 +262,26 @@ double language_model::prob(std::deque tokens) const return lambda_ * prob->second + (1.0 - lambda_) * interp_prob; } -double language_model::perplexity(const std::string& tokens) const +double language_model::perplexity(const sentence& tokens) const { - std::deque ngram; + sentence ngram; for (size_t i = 1; i < N_; ++i) ngram.push_back(""); double perp = 0.0; - for (auto& token : make_deque(tokens)) + for (auto& token : tokens) { ngram.push_back(token); - perp += std::log(1.0 + 1.0 / prob(ngram)); + perp += std::log(1.0 / prob(ngram)); ngram.pop_front(); } - return std::pow(perp, 1.0 / N_); + return perp / N_; } -double language_model::perplexity_per_word(const std::string& tokens) const +double language_model::perplexity_per_word(const sentence& tokens) const { return perplexity(tokens) / tokens.size(); } - -std::deque - language_model::make_deque(const std::string& tokens) const -{ - std::deque d; - std::stringstream sstream{tokens}; - std::string token; - while (sstream >> token) - d.push_back(token); - - return d; -} - -std::string - language_model::make_string(const std::deque& tokens) const -{ - std::string result{""}; - if (tokens.empty()) - return result; - - for (auto& token : tokens) - result += token + " "; - - return result.substr(0, result.size() - 1); // remove trailing space -} } } diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp new file mode 100644 index 000000000..2b1a8fc5f --- /dev/null +++ b/src/lm/sentence.cpp @@ -0,0 +1,88 @@ +/** + * @file sentence.cpp + * @author Sean Massung + */ + +#include "lm/sentence.h" +#include "analyzers/analyzer.h" +#include "analyzers/tokenizers/icu_tokenizer.h" +#include "analyzers/filters/lowercase_filter.h" +#include "analyzers/filters/alpha_filter.h" +#include "analyzers/filters/empty_sentence_filter.h" + +namespace meta +{ +namespace lm +{ +sentence::sentence(const std::string& text) +{ + using namespace analyzers; + std::unique_ptr stream; + stream = make_unique(); + stream = make_unique(std::move(stream)); + stream = make_unique(std::move(stream)); + stream = make_unique(std::move(stream)); + stream->set_content(text); + while (*stream) + tokens_.push_back(stream->next()); + + // remove sentence markers (they're inserted by the LM) + tokens_.pop_front(); + tokens_.pop_back(); +} + +std::string sentence::to_string() const +{ + std::string result{""}; + if (tokens_.empty()) + return result; + + for (auto& token : tokens_) + result += token + " "; + + return result.substr(0, result.size() - 1); // remove trailing space +} + +const std::string& sentence::operator[](size_type idx) const +{ + return tokens_[idx]; +} + +void sentence::substitute(size_type idx, const std::string& token) +{ + tokens_[idx] = token; +} + +void sentence::remove(size_type idx) { tokens_.erase(tokens_.begin() + idx); } + +void sentence::insert(size_type idx, const std::string& token) +{ + tokens_.insert(tokens_.begin() + idx, token); +} + +std::string sentence::front() const { return tokens_.front(); } + +std::string sentence::back() const { return tokens_.back(); } + +void sentence::push_front(const std::string& token) +{ + tokens_.push_front(token); +} + +void sentence::pop_front() { tokens_.pop_front(); } + +void sentence::push_back(const std::string& token) { 
tokens_.push_back(token); } + +void sentence::pop_back() { tokens_.pop_back(); } + +sentence::iterator sentence::begin() { return tokens_.begin(); } + +sentence::iterator sentence::end() { return tokens_.end(); } + +sentence::const_iterator sentence::begin() const { return tokens_.cbegin(); } + +sentence::const_iterator sentence::end() const { return tokens_.cend(); } + +sentence::size_type sentence::size() const { return tokens_.size(); } +} +} diff --git a/src/lm/tools/lm-test.cpp b/src/lm/tools/lm-test.cpp index 6cad71eb4..87e048fcc 100644 --- a/src/lm/tools/lm-test.cpp +++ b/src/lm/tools/lm-test.cpp @@ -6,17 +6,37 @@ #include #include "meta.h" #include "lm/language_model.h" +#include "lm/sentence.h" using namespace meta; +void measure_perplexity(lm::language_model& model, const lm::sentence& line) +{ + std::cout << "=======================================" << std::endl; + std::cout << " Sentence: " << line.to_string() << std::endl; + std::cout << "---------------------------------------" << std::endl; + std::cout << "Perplexity: " << model.perplexity(line) << std::endl; + std::cout << " Per word: " << model.perplexity_per_word(line) << std::endl; + std::cout << "=======================================" << std::endl; +} + int main(int argc, char* argv[]) { lm::language_model model{argv[1], 3}; - for (size_t i = 1; i < 10; ++i) + std::string line; + while (true) { - auto sentence = model.generate(i); - std::cout << sentence << std::endl; - std::cout << " -> perplexity_per_word: " - << model.perplexity_per_word(sentence) << std::endl; + std::cout << "> "; + std::getline(std::cin, line); + if (line.empty()) + break; + lm::sentence sent{line}; + measure_perplexity(model, line); + for (size_t i = 0; i < sent.size(); ++i) + { + lm::sentence cpy{sent}; + cpy.remove(i); + measure_perplexity(model, cpy); + } } } From 9fc1e05ad9e4393ba1889d4500089d7e70cbc984 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 29 Sep 2014 21:02:30 -0500 Subject: [PATCH 006/481] fix sentences by inserting and deleting words --- src/lm/tools/lm-test.cpp | 82 ++++++++++++++++++++++++++++++++++------ 1 file changed, 70 insertions(+), 12 deletions(-) diff --git a/src/lm/tools/lm-test.cpp b/src/lm/tools/lm-test.cpp index 87e048fcc..eb25f02d9 100644 --- a/src/lm/tools/lm-test.cpp +++ b/src/lm/tools/lm-test.cpp @@ -4,39 +4,97 @@ */ #include +#include #include "meta.h" +#include "cpptoml.h" #include "lm/language_model.h" #include "lm/sentence.h" using namespace meta; -void measure_perplexity(lm::language_model& model, const lm::sentence& line) +std::vector function_words(const std::string& config_file) { - std::cout << "=======================================" << std::endl; - std::cout << " Sentence: " << line.to_string() << std::endl; - std::cout << "---------------------------------------" << std::endl; - std::cout << "Perplexity: " << model.perplexity(line) << std::endl; - std::cout << " Per word: " << model.perplexity_per_word(line) << std::endl; - std::cout << "=======================================" << std::endl; + auto config = cpptoml::parse_file(config_file); + std::ifstream in{*config.get_as("function-words")}; + std::vector words; + std::string word; + while (in >> word) + words.push_back(word); + return words; +} + +template +void step(const lm::sentence& sent, const lm::language_model& model, + PQ& candidates, const std::vector& fwords, size_t depth) +{ + if (depth == 2) + return; + + for (size_t i = 0; i < sent.size(); ++i) + { + lm::sentence rem_cpy{sent}; + rem_cpy.remove(i); + 
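// [illustrative note, not part of the patch] The queue here acts as a
// bounded worst-out buffer: the comparator keeps the highest perplexity
// on top, so pushing every candidate and popping whenever the size
// exceeds 100 retains only the 100 best-scoring sentences. The same
// idiom in isolation, with hypothetical scores (needs <queue>):
//
//   std::priority_queue<double> pq; // max-heap: worst score on top
//   for (double score : scores)
//   {
//       pq.push(score);
//       if (pq.size() > 100)
//           pq.pop(); // evict the current worst
//   }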
candidates.emplace(rem_cpy, model.perplexity_per_word(rem_cpy)); + if (candidates.size() > 100) + candidates.pop(); + step(rem_cpy, model, candidates, fwords, depth + 1); + + for (auto& fw : fwords) + { + lm::sentence ins_cpy{sent}; + ins_cpy.insert(i, fw); + candidates.emplace(ins_cpy, model.perplexity_per_word(ins_cpy)); + if (candidates.size() > 100) + candidates.pop(); + step(ins_cpy, model, candidates, fwords, depth + 1); + } + } } int main(int argc, char* argv[]) { lm::language_model model{argv[1], 3}; std::string line; + using pair_t = std::pair; + auto fwords = function_words(argv[1]); + auto comp = [](const pair_t& a, const pair_t& b) + { + return a.second < b.second; + }; + while (true) { std::cout << "> "; std::getline(std::cin, line); if (line.empty()) break; + + std::priority_queue, decltype(comp)> + candidates{comp}; lm::sentence sent{line}; - measure_perplexity(model, line); - for (size_t i = 0; i < sent.size(); ++i) + + candidates.emplace(sent, model.perplexity_per_word(sent)); + step(sent, model, candidates, fwords, 0); + + std::cout << "Found " << candidates.size() << " candidates." + << std::endl; + + std::vector sorted; + while (!candidates.empty()) + { + sorted.push_back(candidates.top()); + candidates.pop(); + } + std::reverse(sorted.begin(), sorted.end()); + + for (size_t i = 0; i < 5; ++i) { - lm::sentence cpy{sent}; - cpy.remove(i); - measure_perplexity(model, cpy); + std::cout << "====================================" << std::endl; + std::cout << (i + 1) << "." << std::endl; + std::cout << " Sentence: " << sorted[i].first.to_string() + << std::endl; + std::cout << " PPW: " << sorted[i].second << std::endl; + std::cout << std::endl; } } } From d9e7a0ff55fdfc5636fa41bf9dbefaf395e8d9ee Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Thu, 16 Oct 2014 20:09:32 -0500 Subject: [PATCH 007/481] insert, remove, and substitute for lm --- data/function-words-short.txt | 100 ++++++++++++++++++++++++++++++++++ src/lm/tools/lm-test.cpp | 88 ++++++++++++++++++++++++++---- 2 files changed, 176 insertions(+), 12 deletions(-) create mode 100644 data/function-words-short.txt diff --git a/data/function-words-short.txt b/data/function-words-short.txt new file mode 100644 index 000000000..44650a51a --- /dev/null +++ b/data/function-words-short.txt @@ -0,0 +1,100 @@ +the +and +to +of +a +in +that +i +is +for +was +he +it +on +you +with +as +have +at +be +are +but +his +this +they +n't +not +we +from +she +by +had +an +do +her +or +what +has +were +one +their +who +all +would +if +when +my +about +can +more +so +there +will +out +been +up +just +like +did +which +your +into +could +some +people +new +no +how +think +other +than +because +get +our +going +its +me +these +two +them +over +said +him +only +then +also +after +time +even +back +most +very +first +now +know +where +make +many +go +may diff --git a/src/lm/tools/lm-test.cpp b/src/lm/tools/lm-test.cpp index eb25f02d9..2290ef0ae 100644 --- a/src/lm/tools/lm-test.cpp +++ b/src/lm/tools/lm-test.cpp @@ -4,11 +4,13 @@ */ #include +#include #include #include "meta.h" #include "cpptoml.h" #include "lm/language_model.h" #include "lm/sentence.h" +#include "porter2_stemmer.h" using namespace meta; @@ -23,30 +25,90 @@ std::vector function_words(const std::string& config_file) return words; } +std::unordered_map> + get_stems(const std::string& config_file) +{ + std::unordered_set vocab; + auto config = cpptoml::parse_file(config_file); + auto prefix = *config.get_as("prefix"); + auto dataset = *config.get_as("dataset"); + std::ifstream 
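// [illustrative note, not part of the patch] get_stems builds an
// inverted index from each Porter2 stem to every surface form seen in
// the corpus, so a word's whole group (e.g. run, runs, running) can
// later serve as substitution candidates. The grouping step in
// isolation, over a hypothetical vocabulary (needs <string>,
// <unordered_map>, <vector>, porter2_stemmer.h):
//
//   std::unordered_map<std::string, std::vector<std::string>> groups;
//   for (const auto& word : vocabulary)
//   {
//       std::string stemmed{word};
//       Porter2Stemmer::stem(stemmed);
//       groups[stemmed].push_back(word);
//   }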
in{prefix + "/" + dataset + "/" + dataset + ".dat"}; + std::string token; + while (in >> token) + { + std::transform(token.begin(), token.end(), token.begin(), ::tolower); + vocab.insert(token); + } + + std::unordered_map> stems; + for (auto& t : vocab) + { + std::string stemmed{t}; + Porter2Stemmer::stem(stemmed); + stems[stemmed].push_back(t); + } + + return stems; +} + template -void step(const lm::sentence& sent, const lm::language_model& model, - PQ& candidates, const std::vector& fwords, size_t depth) +void + step(const lm::sentence& sent, const lm::language_model& model, + PQ& candidates, const std::vector& fwords, size_t depth, + std::unordered_set& seen, + const std::unordered_map>& stems) { - if (depth == 2) + if (depth == 2 || seen.find(sent.to_string()) != seen.end()) return; for (size_t i = 0; i < sent.size(); ++i) { + // remove + lm::sentence rem_cpy{sent}; rem_cpy.remove(i); - candidates.emplace(rem_cpy, model.perplexity_per_word(rem_cpy)); - if (candidates.size() > 100) - candidates.pop(); - step(rem_cpy, model, candidates, fwords, depth + 1); + if (seen.find(rem_cpy.to_string()) == seen.end()) + { + seen.insert(rem_cpy.to_string()); + candidates.emplace(rem_cpy, model.perplexity_per_word(rem_cpy)); + step(rem_cpy, model, candidates, fwords, depth + 1, seen, stems); + } + + // insert for (auto& fw : fwords) { lm::sentence ins_cpy{sent}; ins_cpy.insert(i, fw); - candidates.emplace(ins_cpy, model.perplexity_per_word(ins_cpy)); - if (candidates.size() > 100) - candidates.pop(); - step(ins_cpy, model, candidates, fwords, depth + 1); + if (seen.find(ins_cpy.to_string()) == seen.end()) + { + seen.insert(ins_cpy.to_string()); + candidates.emplace(ins_cpy, model.perplexity_per_word(ins_cpy)); + step(ins_cpy, model, candidates, fwords, depth + 1, seen, + stems); + } + } + + // substitute + + std::string stemmed = sent[i]; + Porter2Stemmer::stem(stemmed); + auto it = stems.find(stemmed); + if (it != stems.end() && it->second.size() != 1) + { + for (auto& stem : it->second) + { + lm::sentence subbed{sent}; + subbed.substitute(i, stem); + if (seen.find(subbed.to_string()) == seen.end()) + { + seen.insert(subbed.to_string()); + candidates.emplace(subbed, + model.perplexity_per_word(subbed)); + step(subbed, model, candidates, fwords, depth + 1, seen, + stems); + } + } } } } @@ -57,6 +119,7 @@ int main(int argc, char* argv[]) std::string line; using pair_t = std::pair; auto fwords = function_words(argv[1]); + auto stems = get_stems(argv[1]); auto comp = [](const pair_t& a, const pair_t& b) { return a.second < b.second; @@ -74,7 +137,8 @@ int main(int argc, char* argv[]) lm::sentence sent{line}; candidates.emplace(sent, model.perplexity_per_word(sent)); - step(sent, model, candidates, fwords, 0); + std::unordered_set seen; + step(sent, model, candidates, fwords, 0, seen, stems); std::cout << "Found " << candidates.size() << " candidates." 
<< std::endl; From 30a13eebcc2a517e3182afc404d1399ac2a4478a Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 17 Oct 2014 16:30:34 -0500 Subject: [PATCH 008/481] move code from lm-test into its own class --- include/lm/diff.h | 62 ++++++++++++++++++ src/lm/CMakeLists.txt | 3 +- src/lm/diff.cpp | 132 ++++++++++++++++++++++++++++++++++++++ src/lm/tools/lm-test.cpp | 135 ++------------------------------------- 4 files changed, 201 insertions(+), 131 deletions(-) create mode 100644 include/lm/diff.h create mode 100644 src/lm/diff.cpp diff --git a/include/lm/diff.h b/include/lm/diff.h new file mode 100644 index 000000000..6c44c959c --- /dev/null +++ b/include/lm/diff.h @@ -0,0 +1,62 @@ +/** + * @file diff.h + * @author Sean Massung + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_LM_DIFF_H_ +#define META_LM_DIFF_H_ + +#include +#include +#include "lm/language_model.h" + +namespace meta +{ +namespace lm +{ +class diff +{ + public: + /** + * @param config_file The file containing configuration information + */ + diff(const std::string& config_file); + + /** + * @param sent The sentence to transform + * @return a sorted list of candidate corrections and their scores + */ + std::vector> candidates(const sentence& sent); + + private: + /** + * @param config_file The file containing configuration information + */ + void set_stems(const std::string& config_file); + + /** + * @param config_file The file containing configuration information + */ + void set_function_words(const std::string& config_file); + + /** + * @param sent + * @param candidates + * @param depth + */ + template + void step(const sentence& sent, PQ& candidates, size_t depth); + + language_model lm_; + std::vector fwords_; + std::unordered_map> stems_; + std::unordered_set seen_; +}; +} +} + +#endif diff --git a/src/lm/CMakeLists.txt b/src/lm/CMakeLists.txt index 7a943c4b2..3342f5f57 100644 --- a/src/lm/CMakeLists.txt +++ b/src/lm/CMakeLists.txt @@ -3,5 +3,6 @@ project(meta-language-model) add_subdirectory(tools) add_library(meta-language-model language_model.cpp + diff.cpp sentence.cpp) -target_link_libraries(meta-language-model meta-corpus) +target_link_libraries(meta-language-model meta-corpus porter2-stemmer) diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp new file mode 100644 index 000000000..dce60de8d --- /dev/null +++ b/src/lm/diff.cpp @@ -0,0 +1,132 @@ +/** + * @file diff.cpp + * @author Sean Massung + */ + +#include +#include +#include "lm/diff.h" +#include "cpptoml.h" +#include "porter2_stemmer.h" + +namespace meta +{ +namespace lm +{ +diff::diff(const std::string& config_file) : lm_{config_file} +{ + set_stems(config_file); + set_function_words(config_file); +} + +std::vector> diff::candidates(const sentence& sent) +{ + using pair_t = std::pair; + auto comp = [](const pair_t& a, const pair_t& b) + { + return a.second < b.second; + }; + std::priority_queue, decltype(comp)> candidates{ + comp}; + candidates.emplace(sent, lm_.perplexity_per_word(sent)); + + seen_.clear(); + step(sent, candidates, 0); + + std::vector sorted; + while (!candidates.empty()) + { + sorted.push_back(candidates.top()); + candidates.pop(); + } + std::reverse(sorted.begin(), sorted.end()); + return sorted; +} + +template +void diff::step(const sentence& sent, PQ& candidates, size_t depth) +{ + if (depth == 2 || seen_.find(sent.to_string()) != seen_.end()) + return; + + for (size_t i = 0; i < sent.size(); 
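// [illustrative note, not part of the patch] Each level of step() tries,
// at every position, one removal, one insertion per function word, and
// one substitution per member of the word's stem group, so the frontier
// grows roughly as (s * (1 + f + v))^d for sentence length s, f function
// words, average stem-group size v, and depth d. With s = 20 and f = 100
// that is on the order of 2,000 candidates at depth 1 and millions of
// paths at depth 2, which is why the depth cutoff and the seen_ set both
// matter.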
++i) + { + // remove + + sentence rem_cpy{sent}; + rem_cpy.remove(i); + if (seen_.find(rem_cpy.to_string()) == seen_.end()) + { + seen_.insert(rem_cpy.to_string()); + candidates.emplace(rem_cpy, lm_.perplexity_per_word(rem_cpy)); + step(rem_cpy, candidates, depth + 1); + } + + // insert + + for (auto& fw : fwords_) + { + sentence ins_cpy{sent}; + ins_cpy.insert(i, fw); + if (seen_.find(ins_cpy.to_string()) == seen_.end()) + { + seen_.insert(ins_cpy.to_string()); + candidates.emplace(ins_cpy, lm_.perplexity_per_word(ins_cpy)); + step(ins_cpy, candidates, depth + 1); + } + } + + // substitute + + std::string stemmed = sent[i]; + Porter2Stemmer::stem(stemmed); + auto it = stems_.find(stemmed); + if (it != stems_.end() && it->second.size() != 1) + { + for (auto& stem : it->second) + { + sentence subbed{sent}; + subbed.substitute(i, stem); + if (seen_.find(subbed.to_string()) == seen_.end()) + { + seen_.insert(subbed.to_string()); + candidates.emplace(subbed, lm_.perplexity_per_word(subbed)); + step(subbed, candidates, depth + 1); + } + } + } + } +} + +void diff::set_function_words(const std::string& config_file) +{ + auto config = cpptoml::parse_file(config_file); + std::ifstream in{*config.get_as("function-words")}; + std::string word; + while (in >> word) + fwords_.push_back(word); +} + +void diff::set_stems(const std::string& config_file) +{ + std::unordered_set vocab; + auto config = cpptoml::parse_file(config_file); + auto prefix = *config.get_as("prefix"); + auto dataset = *config.get_as("dataset"); + std::ifstream in{prefix + "/" + dataset + "/" + dataset + ".dat"}; + std::string token; + while (in >> token) + { + std::transform(token.begin(), token.end(), token.begin(), ::tolower); + vocab.insert(token); + } + + for (auto& t : vocab) + { + std::string stemmed{t}; + Porter2Stemmer::stem(stemmed); + stems_[stemmed].push_back(t); + } +} +} +} diff --git a/src/lm/tools/lm-test.cpp b/src/lm/tools/lm-test.cpp index 2290ef0ae..cc793fbc6 100644 --- a/src/lm/tools/lm-test.cpp +++ b/src/lm/tools/lm-test.cpp @@ -4,127 +4,16 @@ */ #include -#include -#include #include "meta.h" -#include "cpptoml.h" -#include "lm/language_model.h" +#include "lm/diff.h" #include "lm/sentence.h" -#include "porter2_stemmer.h" using namespace meta; -std::vector function_words(const std::string& config_file) -{ - auto config = cpptoml::parse_file(config_file); - std::ifstream in{*config.get_as("function-words")}; - std::vector words; - std::string word; - while (in >> word) - words.push_back(word); - return words; -} - -std::unordered_map> - get_stems(const std::string& config_file) -{ - std::unordered_set vocab; - auto config = cpptoml::parse_file(config_file); - auto prefix = *config.get_as("prefix"); - auto dataset = *config.get_as("dataset"); - std::ifstream in{prefix + "/" + dataset + "/" + dataset + ".dat"}; - std::string token; - while (in >> token) - { - std::transform(token.begin(), token.end(), token.begin(), ::tolower); - vocab.insert(token); - } - - std::unordered_map> stems; - for (auto& t : vocab) - { - std::string stemmed{t}; - Porter2Stemmer::stem(stemmed); - stems[stemmed].push_back(t); - } - - return stems; -} - -template -void - step(const lm::sentence& sent, const lm::language_model& model, - PQ& candidates, const std::vector& fwords, size_t depth, - std::unordered_set& seen, - const std::unordered_map>& stems) -{ - if (depth == 2 || seen.find(sent.to_string()) != seen.end()) - return; - - for (size_t i = 0; i < sent.size(); ++i) - { - // remove - - lm::sentence rem_cpy{sent}; - rem_cpy.remove(i); - 
if (seen.find(rem_cpy.to_string()) == seen.end()) - { - seen.insert(rem_cpy.to_string()); - candidates.emplace(rem_cpy, model.perplexity_per_word(rem_cpy)); - step(rem_cpy, model, candidates, fwords, depth + 1, seen, stems); - } - - // insert - - for (auto& fw : fwords) - { - lm::sentence ins_cpy{sent}; - ins_cpy.insert(i, fw); - if (seen.find(ins_cpy.to_string()) == seen.end()) - { - seen.insert(ins_cpy.to_string()); - candidates.emplace(ins_cpy, model.perplexity_per_word(ins_cpy)); - step(ins_cpy, model, candidates, fwords, depth + 1, seen, - stems); - } - } - - // substitute - - std::string stemmed = sent[i]; - Porter2Stemmer::stem(stemmed); - auto it = stems.find(stemmed); - if (it != stems.end() && it->second.size() != 1) - { - for (auto& stem : it->second) - { - lm::sentence subbed{sent}; - subbed.substitute(i, stem); - if (seen.find(subbed.to_string()) == seen.end()) - { - seen.insert(subbed.to_string()); - candidates.emplace(subbed, - model.perplexity_per_word(subbed)); - step(subbed, model, candidates, fwords, depth + 1, seen, - stems); - } - } - } - } -} - int main(int argc, char* argv[]) { - lm::language_model model{argv[1], 3}; + lm::diff correcter{argv[1]}; std::string line; - using pair_t = std::pair; - auto fwords = function_words(argv[1]); - auto stems = get_stems(argv[1]); - auto comp = [](const pair_t& a, const pair_t& b) - { - return a.second < b.second; - }; - while (true) { std::cout << "> "; @@ -132,32 +21,18 @@ int main(int argc, char* argv[]) if (line.empty()) break; - std::priority_queue, decltype(comp)> - candidates{comp}; lm::sentence sent{line}; - - candidates.emplace(sent, model.perplexity_per_word(sent)); - std::unordered_set seen; - step(sent, model, candidates, fwords, 0, seen, stems); - + auto candidates = correcter.candidates(sent); std::cout << "Found " << candidates.size() << " candidates." << std::endl; - std::vector sorted; - while (!candidates.empty()) - { - sorted.push_back(candidates.top()); - candidates.pop(); - } - std::reverse(sorted.begin(), sorted.end()); - for (size_t i = 0; i < 5; ++i) { std::cout << "====================================" << std::endl; std::cout << (i + 1) << "." 
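// [illustrative note, not part of the patch] After this refactor the
// whole correction loop collapses onto the lm::diff interface; a minimal
// caller, assuming a valid config path (needs lm/diff.h, lm/sentence.h,
// <iostream>):
//
//   lm::diff correcter{"config.toml"};
//   lm::sentence sent{"i could care fewer"};
//   for (const auto& c : correcter.candidates(sent))
//       std::cout << c.first.to_string() << " (" << c.second << ")\n";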
<< std::endl; - std::cout << " Sentence: " << sorted[i].first.to_string() + std::cout << " Sentence: " << candidates[i].first.to_string() << std::endl; - std::cout << " PPW: " << sorted[i].second << std::endl; + std::cout << " PPW: " << candidates[i].second << std::endl; std::cout << std::endl; } } From 62e5ce93b24ef4848846b7b934fe37b7777c9425 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 17 Oct 2014 16:43:51 -0500 Subject: [PATCH 009/481] have sentences record their edits --- include/lm/sentence.h | 4 ++++ src/lm/diff.cpp | 2 +- src/lm/sentence.cpp | 14 +++++++++++++- src/lm/tools/lm-test.cpp | 5 ++++- 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/include/lm/sentence.h b/include/lm/sentence.h index 32ed89eb9..481313f63 100644 --- a/include/lm/sentence.h +++ b/include/lm/sentence.h @@ -7,6 +7,7 @@ #define META_SENTENCE_H_ #include +#include #include namespace meta @@ -62,6 +63,8 @@ class sentence */ void insert(size_type idx, const std::string& token); + const std::vector& operations() const; + std::string front() const; std::string back() const; @@ -101,6 +104,7 @@ class sentence private: std::deque tokens_; + std::vector ops_; }; } } diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index dce60de8d..c312f8e6c 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -46,7 +46,7 @@ std::vector> diff::candidates(const sentence& sent) template void diff::step(const sentence& sent, PQ& candidates, size_t depth) { - if (depth == 2 || seen_.find(sent.to_string()) != seen_.end()) + if (depth == 3 || seen_.find(sent.to_string()) != seen_.end()) return; for (size_t i = 0; i < sent.size(); ++i) diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp index 2b1a8fc5f..ef44e0dbb 100644 --- a/src/lm/sentence.cpp +++ b/src/lm/sentence.cpp @@ -50,14 +50,26 @@ const std::string& sentence::operator[](size_type idx) const void sentence::substitute(size_type idx, const std::string& token) { + ops_.push_back("substitute(" + std::to_string(idx) + ", " + tokens_[idx] + + " -> " + token + ")"); tokens_[idx] = token; } -void sentence::remove(size_type idx) { tokens_.erase(tokens_.begin() + idx); } +void sentence::remove(size_type idx) +{ + ops_.push_back("remove(" + std::to_string(idx) + ", " + (*this)[idx] + ")"); + tokens_.erase(tokens_.begin() + idx); +} void sentence::insert(size_type idx, const std::string& token) { tokens_.insert(tokens_.begin() + idx, token); + ops_.push_back("insert(" + std::to_string(idx) + ", " + token + ")"); +} + +const std::vector& sentence::operations() const +{ + return ops_; } std::string sentence::front() const { return tokens_.front(); } diff --git a/src/lm/tools/lm-test.cpp b/src/lm/tools/lm-test.cpp index cc793fbc6..9f815baf4 100644 --- a/src/lm/tools/lm-test.cpp +++ b/src/lm/tools/lm-test.cpp @@ -32,7 +32,10 @@ int main(int argc, char* argv[]) std::cout << (i + 1) << "." 
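// [illustrative note, not part of the patch] With the edit log added in
// this patch, every candidate can report how it was derived; the format
// follows the strings built in sentence::substitute/remove/insert above.
// Hypothetical output for one candidate:
//
//   Edits:
//     substitute(3, fewer -> less)
//     remove(5, of)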
<< std::endl; std::cout << " Sentence: " << candidates[i].first.to_string() << std::endl; - std::cout << " PPW: " << candidates[i].second << std::endl; + std::cout << " PPW: " << candidates[i].second << std::endl; + std::cout << " Edits:" << std::endl; + for(auto& e: candidates[i].first.operations()) + std::cout << " " << e << std::endl; std::cout << std::endl; } } From 471f02b9ae076b07b653df16a53f1ecafa4e8e44 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 17 Oct 2014 20:48:46 -0500 Subject: [PATCH 010/481] fix seen bug and add default max edit depth --- include/lm/diff.h | 5 ++++- src/lm/diff.cpp | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/lm/diff.h b/include/lm/diff.h index 6c44c959c..574cfab0d 100644 --- a/include/lm/diff.h +++ b/include/lm/diff.h @@ -24,7 +24,8 @@ class diff /** * @param config_file The file containing configuration information */ - diff(const std::string& config_file); + diff(const std::string& config_file, + uint64_t max_depth = default_max_depth_); /** * @param sent The sentence to transform @@ -55,6 +56,8 @@ class diff std::vector fwords_; std::unordered_map> stems_; std::unordered_set seen_; + uint64_t max_depth_; + static constexpr uint64_t default_max_depth_ = 2; }; } } diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index c312f8e6c..63ddb2239 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -13,7 +13,8 @@ namespace meta { namespace lm { -diff::diff(const std::string& config_file) : lm_{config_file} +diff::diff(const std::string& config_file, uint64_t max_depth) + : lm_{config_file}, max_depth_{max_depth} { set_stems(config_file); set_function_words(config_file); @@ -46,7 +47,7 @@ std::vector> diff::candidates(const sentence& sent) template void diff::step(const sentence& sent, PQ& candidates, size_t depth) { - if (depth == 3 || seen_.find(sent.to_string()) != seen_.end()) + if (depth == max_depth_) return; for (size_t i = 0; i < sent.size(); ++i) From b8a98a2daf30f76db87ee3a6dd6480bcbf954c79 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 17 Oct 2014 21:09:12 -0500 Subject: [PATCH 011/481] make into more functions --- include/lm/diff.h | 30 ++++++++++++++++ src/lm/diff.cpp | 91 +++++++++++++++++++++++++++-------------------- 2 files changed, 82 insertions(+), 39 deletions(-) diff --git a/include/lm/diff.h b/include/lm/diff.h index 574cfab0d..d60ede850 100644 --- a/include/lm/diff.h +++ b/include/lm/diff.h @@ -52,6 +52,36 @@ class diff template void step(const sentence& sent, PQ& candidates, size_t depth); + /** + * @param sent + * @param idx + * @param candidates + * @param depth + */ + template + void insert(const sentence& sent, size_t idx, PQ& candidates, + uint64_t depth); + + /** + * @param sent + * @param idx + * @param candidates + * @param depth + */ + template + void remove(const sentence& sent, size_t idx, PQ& candidates, + uint64_t depth); + + /** + * @param sent + * @param idx + * @param candidates + * @param depth + */ + template + void substitute(const sentence& sent, size_t idx, PQ& candidates, + uint64_t depth); + language_model lm_; std::vector fwords_; std::unordered_map> stems_; diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index 63ddb2239..d233556bc 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -45,57 +45,70 @@ std::vector> diff::candidates(const sentence& sent) } template -void diff::step(const sentence& sent, PQ& candidates, size_t depth) +void diff::remove(const sentence& sent, size_t idx, PQ& candidates, + uint64_t depth) { - if (depth == max_depth_) - return; - - for 
(size_t i = 0; i < sent.size(); ++i) + sentence rem_cpy{sent}; + rem_cpy.remove(idx); + if (seen_.find(rem_cpy.to_string()) == seen_.end()) { - // remove + seen_.insert(rem_cpy.to_string()); + candidates.emplace(rem_cpy, lm_.perplexity_per_word(rem_cpy)); + step(rem_cpy, candidates, depth + 1); + } +} - sentence rem_cpy{sent}; - rem_cpy.remove(i); - if (seen_.find(rem_cpy.to_string()) == seen_.end()) +template +void diff::insert(const sentence& sent, size_t idx, PQ& candidates, + uint64_t depth) +{ + for (auto& fw : fwords_) + { + sentence ins_cpy{sent}; + ins_cpy.insert(idx, fw); + if (seen_.find(ins_cpy.to_string()) == seen_.end()) { - seen_.insert(rem_cpy.to_string()); - candidates.emplace(rem_cpy, lm_.perplexity_per_word(rem_cpy)); - step(rem_cpy, candidates, depth + 1); + seen_.insert(ins_cpy.to_string()); + candidates.emplace(ins_cpy, lm_.perplexity_per_word(ins_cpy)); + step(ins_cpy, candidates, depth + 1); } + } +} - // insert - - for (auto& fw : fwords_) +template +void diff::substitute(const sentence& sent, size_t idx, PQ& candidates, + uint64_t depth) +{ + std::string stemmed{sent[idx]}; + Porter2Stemmer::stem(stemmed); + auto it = stems_.find(stemmed); + if (it != stems_.end() && it->second.size() != 1) + { + for (auto& stem : it->second) { - sentence ins_cpy{sent}; - ins_cpy.insert(i, fw); - if (seen_.find(ins_cpy.to_string()) == seen_.end()) + sentence subbed{sent}; + subbed.substitute(idx, stem); + if (seen_.find(subbed.to_string()) == seen_.end()) { - seen_.insert(ins_cpy.to_string()); - candidates.emplace(ins_cpy, lm_.perplexity_per_word(ins_cpy)); - step(ins_cpy, candidates, depth + 1); + seen_.insert(subbed.to_string()); + candidates.emplace(subbed, lm_.perplexity_per_word(subbed)); + step(subbed, candidates, depth + 1); } } + } +} - // substitute +template +void diff::step(const sentence& sent, PQ& candidates, size_t depth) +{ + if (depth == max_depth_) + return; - std::string stemmed = sent[i]; - Porter2Stemmer::stem(stemmed); - auto it = stems_.find(stemmed); - if (it != stems_.end() && it->second.size() != 1) - { - for (auto& stem : it->second) - { - sentence subbed{sent}; - subbed.substitute(i, stem); - if (seen_.find(subbed.to_string()) == seen_.end()) - { - seen_.insert(subbed.to_string()); - candidates.emplace(subbed, lm_.perplexity_per_word(subbed)); - step(subbed, candidates, depth + 1); - } - } - } + for (size_t i = 0; i < sent.size(); ++i) + { + remove(sent, i, candidates, depth); + insert(sent, i, candidates, depth); + substitute(sent, i, candidates, depth); } } From d6333778676675deb0a9a76ffe80b16c2423bc04 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 21 Oct 2014 10:01:15 -0500 Subject: [PATCH 012/481] language_model_exception --- include/lm/language_model.h | 9 +++++++-- src/lm/language_model.cpp | 23 ++++++++++++----------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/include/lm/language_model.h b/include/lm/language_model.h index c2d969de2..1ab16c061 100644 --- a/include/lm/language_model.h +++ b/include/lm/language_model.h @@ -76,8 +76,8 @@ class language_model * @param k Number of results to return * @return a sorted vector of likely next tokens */ - std::vector> - top_k(const sentence& prev, size_t k) const; + std::vector> top_k(const sentence& prev, + size_t k) const; private: /** @@ -110,6 +110,11 @@ class language_model /// The interpolation coefficient for smoothing LM probabilities constexpr static double lambda_ = 0.7; }; + +class language_model_exception : public std::runtime_error +{ + using 
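// [illustrative note, not part of the patch] This one-liner is the C++11
// inheriting-constructor idiom: "using std::runtime_error::runtime_error;"
// pulls the base class constructors into the derived type, so the empty
// exception class supports
//
//   throw language_model_exception{"no transitions found"};
//
// while remaining catchable as a plain std::runtime_error.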
std::runtime_error::runtime_error; +}; } } diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index bef5cbdda..eb85a5af1 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -31,7 +31,7 @@ language_model::language_model(const std::string& config_file) auto group = config.get_group("language-model"); auto nval = group->get_as("n-value"); if (!nval) - throw std::runtime_error{ + throw language_model_exception{ "no n-value specified in language-model group"}; N_ = *nval; @@ -50,20 +50,21 @@ void language_model::select_method(const std::string& config_file) auto group = config.get_group("language-model"); auto format = group->get_as("format"); if (!format) - throw std::runtime_error{"no format specified in language-model group"}; + throw language_model_exception{ + "no format specified in language-model group"}; if (*format == "precomputed") { auto prefix = group->get_as("prefix"); if (!prefix) - throw std::runtime_error{ + throw language_model_exception{ "no prefix specified for precomputed language model"}; read_precomputed(*prefix); } else if (*format == "learn") learn_model(config_file); else - throw std::runtime_error{ + throw language_model_exception{ "language-model format could not be determined"}; } @@ -168,8 +169,8 @@ std::string language_model::next_token(const sentence& tokens, { auto it = dist_.find(tokens.to_string()); if (it == dist_.end()) - throw std::runtime_error{"couldn't find previous n - 1 tokens: " - + tokens.to_string()}; + throw language_model_exception{"couldn't find previous n - 1 tokens: " + + tokens.to_string()}; double cur = 0.0; for (auto& end : it->second) @@ -179,19 +180,19 @@ std::string language_model::next_token(const sentence& tokens, return end.first; } - throw std::runtime_error{"could not generate next token: " - + tokens.to_string()}; + throw language_model_exception{"could not generate next token: " + + tokens.to_string()}; } std::vector> language_model::top_k(const sentence& prev, size_t k) const { if (prev.size() != N_ - 1) - throw std::runtime_error{"prev should contain n - 1 tokens"}; + throw language_model_exception{"prev should contain n - 1 tokens"}; auto it = dist_.find(prev.to_string()); if (it == dist_.end()) - throw std::runtime_error{"no transitions found"}; + throw language_model_exception{"no transitions found"}; using pair_t = std::pair; std::vector probs{it->second.begin(), it->second.end()}; @@ -242,7 +243,7 @@ std::string language_model::generate(unsigned int seed) const double language_model::prob(sentence tokens) const { if (tokens.size() != N_) - throw std::runtime_error{"prob() needs one N-gram"}; + throw language_model_exception{"prob() needs one N-gram"}; sentence interp_tokens{tokens}; interp_tokens.pop_front(); // look at prev N - 1 From 30d0452d1535488decfe2a0cc497e593df963784 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 21 Oct 2014 10:01:51 -0500 Subject: [PATCH 013/481] splice operator() for sentence and sentence_exception --- include/lm/sentence.h | 7 +++++++ src/lm/sentence.cpp | 17 +++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/include/lm/sentence.h b/include/lm/sentence.h index 481313f63..64edd01c7 100644 --- a/include/lm/sentence.h +++ b/include/lm/sentence.h @@ -44,6 +44,8 @@ class sentence */ const std::string& operator[](size_type idx) const; + sentence operator()(size_type from, size_type to) const; + /** * @param idx * @param token @@ -106,6 +108,11 @@ class sentence std::deque tokens_; std::vector ops_; }; + +class sentence_exception 
: public std::runtime_error +{ + using std::runtime_error::runtime_error; +}; } } diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp index ef44e0dbb..6ce20a06c 100644 --- a/src/lm/sentence.cpp +++ b/src/lm/sentence.cpp @@ -48,6 +48,18 @@ const std::string& sentence::operator[](size_type idx) const return tokens_[idx]; } +sentence sentence::operator()(size_type from, size_type to) const +{ + sentence ret; + if (from > to || to > tokens_.size()) + throw sentence_exception{"operator() out of bounds: from = " + + std::to_string(from) + ", to = " + + std::to_string(to)}; + ret.tokens_.insert(ret.tokens_.begin(), tokens_.begin() + from, + tokens_.begin() + to); + return ret; +} + void sentence::substitute(size_type idx, const std::string& token) { ops_.push_back("substitute(" + std::to_string(idx) + ", " + tokens_[idx] @@ -67,10 +79,7 @@ void sentence::insert(size_type idx, const std::string& token) ops_.push_back("insert(" + std::to_string(idx) + ", " + token + ")"); } -const std::vector& sentence::operations() const -{ - return ops_; -} +const std::vector& sentence::operations() const { return ops_; } std::string sentence::front() const { return tokens_.front(); } From 589b014d332d14ab99a2ee8f58eb05cc9ec55c93 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 21 Oct 2014 10:02:07 -0500 Subject: [PATCH 014/481] lm operations to lm diff --- include/lm/diff.h | 23 +++++++++++++++- src/lm/diff.cpp | 58 ++++++++++++++++++++++++++++++++++++++-- src/lm/tools/lm-test.cpp | 2 +- 3 files changed, 79 insertions(+), 4 deletions(-) diff --git a/include/lm/diff.h b/include/lm/diff.h index d60ede850..7b73d20b9 100644 --- a/include/lm/diff.h +++ b/include/lm/diff.h @@ -29,9 +29,11 @@ class diff /** * @param sent The sentence to transform + * @param use_lm * @return a sorted list of candidate corrections and their scores */ - std::vector> candidates(const sentence& sent); + std::vector> candidates(const sentence& sent, + bool use_lm = true); private: /** @@ -52,6 +54,14 @@ class diff template void step(const sentence& sent, PQ& candidates, size_t depth); + /** + * @param sent + * @param candidates + * @param depth + */ + template + void step_lm(const sentence& sent, PQ& candidates, size_t depth); + /** * @param sent * @param idx @@ -62,6 +72,16 @@ class diff void insert(const sentence& sent, size_t idx, PQ& candidates, uint64_t depth); + /** + * @param sent + * @param idx + * @param candidates + * @param depth + * @param substitute + */ + template + void lm_ops(const sentence& sent, size_t idx, PQ& candidates, + uint64_t depth, bool substitute); /** * @param sent * @param idx @@ -88,6 +108,7 @@ class diff std::unordered_set seen_; uint64_t max_depth_; static constexpr uint64_t default_max_depth_ = 2; + static constexpr uint64_t n_val_ = 3; }; } } diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index d233556bc..238900042 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -20,7 +20,8 @@ diff::diff(const std::string& config_file, uint64_t max_depth) set_function_words(config_file); } -std::vector> diff::candidates(const sentence& sent) +std::vector> + diff::candidates(const sentence& sent, bool use_lm /* = false */) { using pair_t = std::pair; auto comp = [](const pair_t& a, const pair_t& b) @@ -32,7 +33,10 @@ std::vector> diff::candidates(const sentence& sent) candidates.emplace(sent, lm_.perplexity_per_word(sent)); seen_.clear(); - step(sent, candidates, 0); + if (use_lm) + step_lm(sent, candidates, 0); + else + step(sent, candidates, 0); std::vector sorted; while (!candidates.empty()) @@ -58,6 +62,40 
@@ void diff::remove(const sentence& sent, size_t idx, PQ& candidates,
 }
 }

+template <class PQ>
+void diff::lm_ops(const sentence& sent, size_t idx, PQ& candidates,
+ uint64_t depth, bool substitute)
+{
+ if (idx < n_val_ - 1)
+ return;
+
+ auto spliced = sent(idx - (n_val_ - 1), idx);
+ try
+ {
+ auto best = lm_.top_k(spliced, 5);
+ for (auto& p : best)
+ {
+ if (p.first == "</s>")
+ continue;
+ sentence ins_cpy{sent};
+ if (substitute)
+ ins_cpy.insert(idx, p.first);
+ else
+ ins_cpy.substitute(idx, p.first);
+ if (seen_.find(ins_cpy.to_string()) == seen_.end())
+ {
+ seen_.insert(ins_cpy.to_string());
+ candidates.emplace(ins_cpy, lm_.perplexity_per_word(ins_cpy));
+ step_lm(ins_cpy, candidates, depth + 1);
+ }
+ }
+ }
+ catch (language_model_exception& ex)
+ {
+ // ignore if there are no transitions found
+ }
+}
+
 template <class PQ>
 void diff::insert(const sentence& sent, size_t idx, PQ& candidates,
 uint64_t depth)
@@ -112,6 +150,22 @@ void diff::step(const sentence& sent, PQ& candidates, size_t depth)
 }
 }

+template <class PQ>
+void diff::step_lm(const sentence& sent, PQ& candidates, size_t depth)
+{
+ if (depth == max_depth_)
+ return;
+
+ for (size_t i = 0; i <= sent.size(); ++i)
+ {
+ remove(sent, i, candidates, depth);
+ insert(sent, i, candidates, depth);
+ lm_ops(sent, i, candidates, depth, true);
+ lm_ops(sent, i, candidates, depth, false);
+ substitute(sent, i, candidates, depth);
+ }
+}
+
 void diff::set_function_words(const std::string& config_file)
 {
 auto config = cpptoml::parse_file(config_file);
diff --git a/src/lm/tools/lm-test.cpp b/src/lm/tools/lm-test.cpp
index 9f815baf4..9087aff8e 100644
--- a/src/lm/tools/lm-test.cpp
+++ b/src/lm/tools/lm-test.cpp
@@ -22,7 +22,7 @@ int main(int argc, char* argv[])
 break;
 lm::sentence sent{line};
- auto candidates = correcter.candidates(sent);
+ auto candidates = correcter.candidates(sent, true);
 std::cout << "Found " << candidates.size() << " candidates."
 << std::endl;
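A minimal sketch of the lookup at the heart of the new lm_ops, for reference; it is not part of the patch series, and the config path, trigram order, example sentence, and index are all assumptions for illustration:

    #include <cstddef>
    #include <iostream>
    #include "lm/language_model.h"
    #include "lm/sentence.h"

    using namespace meta;

    int main(int argc, char* argv[])
    {
        // Trigram model, matching n_val_ = 3 in lm::diff; argv[1] names a
        // config.toml containing a [language-model] group.
        lm::language_model model{argv[1], 3};

        lm::sentence sent{"I would like a cup of coffee ."};
        std::size_t idx = 4;               // position being edited
        auto context = sent(idx - 2, idx); // the two tokens before idx

        // The same call lm_ops makes on its spliced (n - 1)-gram context.
        for (auto& next : model.top_k(context, 5))
            std::cout << next.first << " " << next.second << std::endl;
    }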
From 1cb5adc4c8956bec4f30ef1e9f38371bfbe05d97 Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Thu, 23 Oct 2014 11:43:04 -0500
Subject: [PATCH 015/481] max PQ size and add function

---
 include/lm/diff.h | 8 ++++++++
 src/lm/diff.cpp | 35 +++++++++++++++++++++--------------
 2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/include/lm/diff.h b/include/lm/diff.h
index 7b73d20b9..8b2c1d61c 100644
--- a/include/lm/diff.h
+++ b/include/lm/diff.h
@@ -102,6 +102,13 @@ class diff
 void substitute(const sentence& sent, size_t idx, PQ& candidates,
 uint64_t depth);

+ /**
+ * @param candidates
+ * @param sent
+ */
+ template <class PQ>
+ void add(PQ& candidates, sentence& sent);
+
 language_model lm_;
 std::vector<std::string> fwords_;
 std::unordered_map<std::string, std::vector<std::string>> stems_;
@@ -109,6 +116,7 @@ class diff
 uint64_t max_depth_;
 static constexpr uint64_t default_max_depth_ = 2;
 static constexpr uint64_t n_val_ = 3;
+ static constexpr uint64_t max_cand_size_ = 100;
 };
 }
 }
diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp
index 238900042..c0210a02b 100644
--- a/src/lm/diff.cpp
+++ b/src/lm/diff.cpp
@@ -3,6 +3,8 @@
 * @author Sean Massung
 */

+#include
+
 #include
 #include
 #include "lm/diff.h"
@@ -41,13 +43,22 @@ std::vector<std::pair<sentence, double>>
 std::vector<pair_t> sorted;
 while (!candidates.empty())
 {
- sorted.push_back(candidates.top());
+ sorted.emplace_back(std::move(candidates.top()));
 candidates.pop();
 }
 std::reverse(sorted.begin(), sorted.end());
 return sorted;
 }

+template <class PQ>
+void diff::add(PQ& candidates, sentence& sent)
+{
+ seen_.insert(sent.to_string());
+ candidates.emplace(sent, lm_.perplexity_per_word(sent));
+ if (candidates.size() > max_cand_size_)
+ candidates.pop();
+}
+
 template <class PQ>
 void diff::remove(const sentence& sent, size_t idx, PQ& candidates,
 uint64_t depth)
@@ -56,8 +67,7 @@ void diff::remove(const sentence& sent, size_t idx, PQ& candidates,
 rem_cpy.remove(idx);
 if (seen_.find(rem_cpy.to_string()) == seen_.end())
 {
- seen_.insert(rem_cpy.to_string());
- candidates.emplace(rem_cpy, lm_.perplexity_per_word(rem_cpy));
+ add(candidates, rem_cpy);
 step(rem_cpy, candidates, depth + 1);
 }
 }
@@ -77,16 +87,15 @@ void diff::lm_ops(const sentence& sent, size_t idx, PQ& candidates,
 {
 if (p.first == "</s>")
 continue;
- sentence ins_cpy{sent};
+ sentence cpy{sent};
 if (substitute)
- ins_cpy.insert(idx, p.first);
+ cpy.insert(idx, p.first);
 else
- ins_cpy.substitute(idx, p.first);
+ cpy.substitute(idx, p.first);
 if (seen_.find(cpy.to_string()) == seen_.end())
 {
 add(candidates, cpy);
- step_lm(ins_cpy, candidates, depth + 1);
+ step_lm(cpy, candidates, depth + 1);
 }
 }
 }
@@ -106,8 +115,7 @@ void diff::insert(const sentence& sent, size_t idx, PQ& candidates,
 ins_cpy.insert(idx, fw);
 if (seen_.find(ins_cpy.to_string()) == seen_.end())
 {
- seen_.insert(ins_cpy.to_string());
- candidates.emplace(ins_cpy, lm_.perplexity_per_word(ins_cpy));
+ add(candidates, ins_cpy);
 step(ins_cpy, candidates, depth + 1);
 }
 }
@@ -128,8 +136,7 @@ void diff::substitute(const sentence& sent, size_t idx, PQ& candidates,
 subbed.substitute(idx, stem);
 if (seen_.find(subbed.to_string()) == seen_.end())
 {
- seen_.insert(subbed.to_string());
- candidates.emplace(subbed, lm_.perplexity_per_word(subbed));
+ add(candidates, subbed);
 step(subbed, candidates, depth + 1);
 }
 }
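The new add() helper keeps the frontier bounded: the candidate queue orders worst-first, so one pop() after each insertion evicts the current worst entry once max_cand_size_ is exceeded. A self-contained sketch of the same keep-the-k-best pattern (names and scores are illustrative; lower scores, like perplexities, are better):

    #include <cstddef>
    #include <iostream>
    #include <queue>
    #include <string>
    #include <utility>
    #include <vector>

    int main()
    {
        using scored = std::pair<std::string, double>;

        // Keep the worst (highest-scored) candidate on top() so that
        // pop() can evict it.
        auto comp = [](const scored& a, const scored& b)
        { return a.second < b.second; };
        std::priority_queue<scored, std::vector<scored>, decltype(comp)>
            best{comp};

        const std::size_t max_size = 3;
        std::vector<scored> input = {{"a", 5.0}, {"b", 2.0}, {"c", 9.0},
                                     {"d", 1.0}, {"e", 4.0}};
        for (const auto& cand : input)
        {
            best.push(cand);
            if (best.size() > max_size)
                best.pop(); // evict the current worst candidate
        }

        // Prints the three lowest-scored items, worst first: e, b, d.
        while (!best.empty())
        {
            std::cout << best.top().first << " " << best.top().second
                      << std::endl;
            best.pop();
        }
    }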
From 4576c3875b51f205c7af4914e6e6faa2e567bc58 Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Thu, 23 Oct 2014 12:18:06 -0500
Subject: [PATCH 016/481] fix bug where LM operations were not always called

---
 include/lm/diff.h | 9 +--------
 src/lm/diff.cpp | 31 ++++++++-----------------------
 2 files changed, 9 insertions(+), 31 deletions(-)

diff --git a/include/lm/diff.h b/include/lm/diff.h
index 8b2c1d61c..906143ba1 100644
--- a/include/lm/diff.h
+++ b/include/lm/diff.h
@@ -54,14 +54,6 @@ class diff
 template <class PQ>
 void step(const sentence& sent, PQ& candidates, size_t depth);

- /**
- * @param sent
- * @param candidates
- * @param depth
- */
- template <class PQ>
- void step_lm(const sentence& sent, PQ& candidates, size_t depth);
-
 /**
 * @param sent
 * @param idx
@@ -114,6 +106,7 @@ class diff
 std::unordered_map<std::string, std::vector<std::string>> stems_;
 std::unordered_set<std::string> seen_;
 uint64_t max_depth_;
+ bool use_lm_;
 static constexpr uint64_t default_max_depth_ = 2;
 static constexpr uint64_t n_val_ = 3;
 static constexpr uint64_t max_cand_size_ = 100;
diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp
index c0210a02b..3154169cd 100644
--- a/src/lm/diff.cpp
+++ b/src/lm/diff.cpp
@@ -3,8 +3,6 @@
 * @author Sean Massung
 */

-#include
-
 #include
 #include
 #include "lm/diff.h"
@@ -25,6 +23,7 @@ std::vector<std::pair<sentence, double>>
 diff::candidates(const sentence& sent, bool use_lm /* = false */)
 {
+ use_lm_ = use_lm;
 using pair_t = std::pair<sentence, double>;
 auto comp = [](const pair_t& a, const pair_t& b)
 {
@@ -35,10 +34,7 @@ std::vector<std::pair<sentence, double>>
 candidates.emplace(sent, lm_.perplexity_per_word(sent));
 seen_.clear();
- if (use_lm)
- step_lm(sent, candidates, 0);
- else
- step(sent, candidates, 0);
+ step(sent, candidates, 0);

 std::vector<pair_t> sorted;
 while (!candidates.empty())
@@ -95,7 +91,7 @@ void diff::lm_ops(const sentence& sent, size_t idx, PQ& candidates,
 if (seen_.find(cpy.to_string()) == seen_.end())
 {
 add(candidates, cpy);
- step_lm(cpy, candidates, depth + 1);
+ step(cpy, candidates, depth + 1);
 }
 }
 }
@@ -153,22 +149,11 @@ void diff::step(const sentence& sent, PQ& candidates, size_t depth)
 {
 remove(sent, i, candidates, depth);
 insert(sent, i, candidates, depth);
- substitute(sent, i, candidates, depth);
- }
-}
-
-template <class PQ>
-void diff::step_lm(const sentence& sent, PQ& candidates, size_t depth)
-{
- if (depth == max_depth_)
- return;
-
- for (size_t i = 0; i <= sent.size(); ++i)
- {
- remove(sent, i, candidates, depth);
- insert(sent, i, candidates, depth);
- lm_ops(sent, i, candidates, depth, true);
- lm_ops(sent, i, candidates, depth, false);
+ if (use_lm_)
+ {
+ lm_ops(sent, i, candidates, depth, true);
+ lm_ops(sent, i, candidates, depth, false);
+ }
 substitute(sent, i, candidates, depth);
 }
 }

From 3ee547f5e7b1856afc6f094ca4e72575b576f917 Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Thu, 6 Nov 2014 16:37:52 -0600
Subject: [PATCH 017/481] executable to generate dataset based on LM edits

---
 src/lm/tools/CMakeLists.txt | 3 +++
 src/lm/tools/create-dataset.cpp | 30 ++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 src/lm/tools/create-dataset.cpp

diff --git a/src/lm/tools/CMakeLists.txt b/src/lm/tools/CMakeLists.txt
index c331e5700..81c5d018c 100644
--- a/src/lm/tools/CMakeLists.txt
+++ b/src/lm/tools/CMakeLists.txt
@@ -1,2 +1,5 @@
 add_executable(lm-test lm-test.cpp)
 target_link_libraries(lm-test meta-language-model meta-index)
+
+add_executable(create-dataset create-dataset.cpp)
+target_link_libraries(create-dataset meta-language-model meta-index)
diff --git a/src/lm/tools/create-dataset.cpp b/src/lm/tools/create-dataset.cpp
new file mode 100644
index 000000000..ce60bd754
--- /dev/null
+++ b/src/lm/tools/create-dataset.cpp
@@ -0,0 +1,30 @@
+/**
+ *
@file create-dataset.cpp + * @author Sean Massung + */ + +#include +#include +#include "meta.h" +#include "lm/diff.h" +#include "lm/sentence.h" + +using namespace meta; + +int main(int argc, char* argv[]) +{ + lm::diff correcter{argv[1]}; + std::string line; + std::ifstream in{argv[2]}; + while (in) + { + std::getline(in, line); + if(line.empty()) + continue; + lm::sentence sent{line}; + auto candidates = correcter.candidates(sent, false); + std::cout << candidates[0].first.to_string() << std::endl; + for (auto& e : candidates[0].first.operations()) + std::cout << " " << e << std::endl; + } +} From 4aa28458b9dc895b31ecee483fa55e7d03418ee8 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Thu, 6 Nov 2014 16:39:29 -0600 Subject: [PATCH 018/481] make LM generation output clearer (commented out sections work with precomputed LM datasets with no beginning and end sentence markers) --- src/lm/language_model.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index eb85a5af1..7a1121b48 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -44,8 +44,6 @@ language_model::language_model(const std::string& config_file) void language_model::select_method(const std::string& config_file) { - std::cout << "Creating " << N_ << "-gram language model" << std::endl; - auto config = cpptoml::parse_file(config_file); auto group = config.get_group("language-model"); auto format = group->get_as("format"); @@ -78,6 +76,7 @@ language_model::language_model(const std::string& config_file, size_t n) : N_{n} void language_model::learn_model(const std::string& config_file) { + std::cout << "Learning " << N_ << "-gram language model" << std::endl; auto corpus = corpus::corpus::load(config_file); using namespace analyzers; @@ -125,6 +124,7 @@ void language_model::learn_model(const std::string& config_file) void language_model::read_precomputed(const std::string& prefix) { + std::cout << "Reading " << N_ << "-gram language model" << std::endl; std::ifstream in{prefix + std::to_string(N_) + "-grams.txt"}; std::string line; uint64_t count; @@ -236,7 +236,7 @@ std::string language_model::generate(unsigned int seed) const next = next_token(ngram, rdist(gen)); } - output += ngram.to_string(); + output += " " + ngram.to_string(); return output; } @@ -267,12 +267,15 @@ double language_model::perplexity(const sentence& tokens) const { sentence ngram; for (size_t i = 1; i < N_; ++i) + // ngram.push_back(tokens[i - 1]); ngram.push_back(""); double perp = 0.0; for (auto& token : tokens) + // for (size_t i = N_; i < tokens.size(); ++i) { ngram.push_back(token); + // ngram.push_back(tokens[i]); perp += std::log(1.0 / prob(ngram)); ngram.pop_front(); } From 67853a411500f266c43cb0965cd85ab29f9a173b Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Thu, 6 Nov 2014 18:21:14 -0600 Subject: [PATCH 019/481] LM operations only edit the least likely word each iteration --- include/lm/diff.h | 7 ++--- src/lm/diff.cpp | 68 ++++++++++++++++++++++++++++++----------------- 2 files changed, 46 insertions(+), 29 deletions(-) diff --git a/include/lm/diff.h b/include/lm/diff.h index 906143ba1..1ee5749b6 100644 --- a/include/lm/diff.h +++ b/include/lm/diff.h @@ -66,14 +66,11 @@ class diff /** * @param sent - * @param idx * @param candidates * @param depth - * @param substitute */ template - void lm_ops(const sentence& sent, size_t idx, PQ& candidates, - uint64_t depth, bool substitute); + void lm_ops(const sentence& sent, PQ& candidates, uint64_t depth); 
/**
 * @param sent
 * @param idx
 * @param candidates
 * @param depth
 */
@@ -107,7 +104,7 @@ class diff
 std::unordered_map<std::string, std::vector<std::string>> stems_;
 std::unordered_set<std::string> seen_;
 uint64_t max_depth_;
 bool use_lm_;
- static constexpr uint64_t default_max_depth_ = 2;
+ static constexpr uint64_t default_max_depth_ = 4;
 static constexpr uint64_t n_val_ = 3;
 static constexpr uint64_t max_cand_size_ = 100;
 };
diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp
index 3154169cd..04e35933b 100644
--- a/src/lm/diff.cpp
+++ b/src/lm/diff.cpp
@@ -3,6 +3,8 @@
 * @author Sean Massung
 */

+#include
+
 #include
 #include
 #include "lm/diff.h"
@@ -69,30 +71,48 @@ void diff::remove(const sentence& sent, size_t idx, PQ& candidates,
 }

 template <class PQ>
-void diff::lm_ops(const sentence& sent, size_t idx, PQ& candidates,
- uint64_t depth, bool substitute)
+void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth)
 {
- if (idx < n_val_ - 1)
+ if (sent.size() < n_val_)
 return;

- auto spliced = sent(idx - (n_val_ - 1), idx);
+ double min_prob = 1;
+ uint64_t best_idx = 0;
+ sentence best;
+ for (uint64_t i = n_val_ - 1; i < sent.size(); ++i)
+ {
+ auto ngram = sent(i - (n_val_ - 1), i + 1);
+ auto prob = lm_.prob(ngram);
+ if (prob < min_prob)
+ {
+ min_prob = prob;
+ best_idx = i;
+ best = ngram;
+ }
+ }
+
+ sentence rem_cpy{sent};
+ rem_cpy.remove(best_idx);
+ add(candidates, rem_cpy);
+ step(rem_cpy, candidates, depth + 1);
+
+ best.pop_back();
 try
 {
- auto best = lm_.top_k(spliced, 5);
- for (auto& p : best)
+ for (auto& next : lm_.top_k(best, 5))
 {
- if (p.first == "</s>")
+ if (next.first == "</s>")
 continue;
- sentence cpy{sent};
- if (substitute)
- cpy.insert(idx, p.first);
- else
- cpy.substitute(idx, p.first);
- if (seen_.find(cpy.to_string()) == seen_.end())
- {
- add(candidates, cpy);
- step(cpy, candidates, depth + 1);
- }
+ sentence ins_cpy{sent};
+ ins_cpy.insert(best_idx, next.first);
+ add(candidates, ins_cpy);
+ step(ins_cpy, candidates, depth + 1);
+
+ sentence sub_cpy{sent};
+ sub_cpy.substitute(best_idx, next.first);
+ add(candidates, sub_cpy);
+ step(sub_cpy, candidates, depth + 1);
 }
 }
 catch (language_model_exception& ex)
@@ -145,16 +165,16 @@ void diff::step(const sentence& sent, PQ& candidates, size_t depth)
 if (depth == max_depth_)
 return;

- for (size_t i = 0; i < sent.size(); ++i)
+ if (use_lm_)
+ lm_ops(sent, candidates, depth);
+ else
 {
- remove(sent, i, candidates, depth);
- insert(sent, i, candidates, depth);
+ for (size_t i = 0; i < sent.size(); ++i)
+ {
+ remove(sent, i, candidates, depth);
+ insert(sent, i, candidates, depth);
 substitute(sent, i, candidates, depth);
 }
 }
 }
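Patch 019 above shrinks the branching factor by editing only the least probable position in the sentence. The argmin scan at its core, as a self-contained sketch (the hard-coded probabilities stand in for lm_.prob evaluated on each n-gram):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
        // Stand-in probabilities for each n-gram ending position.
        std::vector<double> prob = {0.21, 0.35, 0.02, 0.18};

        // Remember the least likely position; lm_ops then tries a remove,
        // an insert, and a substitute only at that single index.
        double min_prob = 1.0;
        std::size_t best_idx = 0;
        for (std::size_t i = 0; i < prob.size(); ++i)
        {
            if (prob[i] < min_prob)
            {
                min_prob = prob[i];
                best_idx = i;
            }
        }

        std::cout << "edit position " << best_idx << " (p = " << min_prob
                  << ")" << std::endl; // prints position 2
    }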
From 4de43a1f64b0f316986b38ae9f2a953095106cfc Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Mon, 10 Nov 2014 12:02:31 -0600
Subject: [PATCH 020/481] don't add duplicate sentences

---
 src/lm/diff.cpp | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp
index 04e35933b..e182a3a7f 100644
--- a/src/lm/diff.cpp
+++ b/src/lm/diff.cpp
@@ -93,8 +93,11 @@ void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth)

 sentence rem_cpy{sent};
 rem_cpy.remove(best_idx);
- add(candidates, rem_cpy);
- step(rem_cpy, candidates, depth + 1);
+ if (seen_.find(rem_cpy.to_string()) == seen_.end())
+ {
+ add(candidates, rem_cpy);
+ step(rem_cpy, candidates, depth + 1);
+ }

 best.pop_back();
 try
@@ -106,13 +109,21 @@ void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth)

 sentence ins_cpy{sent};
 ins_cpy.insert(best_idx, next.first);
- add(candidates, ins_cpy);
- step(ins_cpy, candidates, depth + 1);
+
+ if (seen_.find(ins_cpy.to_string()) == seen_.end())
+ {
+ add(candidates, ins_cpy);
+ step(ins_cpy, candidates, depth + 1);
+ }

 sentence sub_cpy{sent};
 sub_cpy.substitute(best_idx, next.first);
- add(candidates, sub_cpy);
- step(sub_cpy, candidates, depth + 1);
+
+ if (seen_.find(sub_cpy.to_string()) == seen_.end())
+ {
+ add(candidates, sub_cpy);
+ step(sub_cpy, candidates, depth + 1);
+ }
 }
 }
 catch (language_model_exception& ex)

From e43ec71bd46e5bbdb7b50a0cf46d441f7664d764 Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Mon, 10 Nov 2014 12:15:46 -0600
Subject: [PATCH 021/481] weights for sentence edits

---
 include/lm/sentence.h | 24 +++++++++++++++++++---
 src/lm/sentence.cpp | 20 +++++++++++++++++---
 2 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/include/lm/sentence.h b/include/lm/sentence.h
index 64edd01c7..b0a5c1ae7 100644
--- a/include/lm/sentence.h
+++ b/include/lm/sentence.h
@@ -49,22 +49,39 @@ class sentence
 /**
 * @param idx
 * @param token
+ * @param weight The weight that this edit carries
 * @return replace the token at the specified index with the provided token
 */
- void substitute(size_type idx, const std::string& token);
+ void substitute(size_type idx, const std::string& token,
+ double weight = 0.0);

 /**
 * @param idx Index of the token to remove from this sentence
+ * @param weight The weight that this edit carries
 */
- void remove(size_type idx);
+ void remove(size_type idx, double weight = 0.0);

 /**
 * @param idx Index to insert a token in front of (to insert at beginning,
 * idx = 0)
 * @param token
+ * @param weight The weight that this edit carries
 */
- void insert(size_type idx, const std::string& token);
+ void insert(size_type idx, const std::string& token, double weight = 0.0);

+ /**
+ * @return the average weight of edits to this sentence
+ */
+ double average_weight() const;
+
+ /**
+ * @return the sequence of edit weights to this sentence
+ */
+ std::vector<double> weights() const;
+
+ /**
+ * @return the operations (edits) performed on this sentence
+ */
 const std::vector<std::string>& operations() const;

 std::string front() const;
@@ -107,6 +124,7 @@ class sentence
 private:
 std::deque<std::string> tokens_;
 std::vector<std::string> ops_;
+ std::vector<double> weights_;
 };

 class sentence_exception : public std::runtime_error
diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp
index 6ce20a06c..c890313b6 100644
--- a/src/lm/sentence.cpp
+++ b/src/lm/sentence.cpp
@@ -3,6 +3,7 @@
 * @author Sean Massung
 */

+#include <numeric>
 #include "lm/sentence.h"
 #include "analyzers/analyzer.h"
 #include "analyzers/tokenizers/icu_tokenizer.h"
@@ -60,25 +61,38 @@ sentence sentence::operator()(size_type from, size_type to) const
 return ret;
 }

-void sentence::substitute(size_type idx, const std::string& token)
+void sentence::substitute(size_type idx, const std::string& token,
+ double weight /* = 0.0 */)
 {
 ops_.push_back("substitute(" + std::to_string(idx) + ", " + tokens_[idx]
 + " -> " + token + ")");
 tokens_[idx] = token;
+ weights_.push_back(weight);
 }

-void sentence::remove(size_type idx)
+void sentence::remove(size_type idx, double weight /* = 0.0 */)
 {
 ops_.push_back("remove(" + std::to_string(idx) + ", " + (*this)[idx] + ")");
 tokens_.erase(tokens_.begin() + idx);
+ weights_.push_back(weight);
 }

-void sentence::insert(size_type idx, const std::string& token)
+void sentence::insert(size_type idx, const std::string& token,
+ double weight /* = 0.0 */)
 {
 tokens_.insert(tokens_.begin() + idx, token);
ops_.push_back("insert(" + std::to_string(idx) + ", " + token + ")"); + weights_.push_back(weight); } +double sentence::average_weight() const +{ + double sum = std::accumulate(weights_.begin(), weights_.end(), 0.0); + return sum / weights_.size(); +} + +std::vector sentence::weights() const { return weights_; } + const std::vector& sentence::operations() const { return ops_; } std::string sentence::front() const { return tokens_.front(); } From 4ef15ae35f7dc764e5af7a5c3fac4aaf74d51e50 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 10 Nov 2014 13:44:58 -0600 Subject: [PATCH 022/481] scoring for diff is combination of perplexity and weighted edits --- include/lm/diff.h | 7 +++++-- src/lm/diff.cpp | 8 +++++--- src/lm/tools/lm-test.cpp | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/include/lm/diff.h b/include/lm/diff.h index 1ee5749b6..78f16edfc 100644 --- a/include/lm/diff.h +++ b/include/lm/diff.h @@ -96,7 +96,7 @@ class diff * @param sent */ template - void add(PQ& candidates, sentence& sent); + void add(PQ& candidates, const sentence& sent); language_model lm_; std::vector fwords_; @@ -104,9 +104,12 @@ class diff std::unordered_set seen_; uint64_t max_depth_; bool use_lm_; - static constexpr uint64_t default_max_depth_ = 4; + static constexpr uint64_t default_max_depth_ = 3; static constexpr uint64_t n_val_ = 3; static constexpr uint64_t max_cand_size_ = 100; + + /// balance between perplexity and edit weights + static constexpr double lambda_ = 0.5; }; } } diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index e182a3a7f..648a53b31 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -33,7 +33,7 @@ std::vector> }; std::priority_queue, decltype(comp)> candidates{ comp}; - candidates.emplace(sent, lm_.perplexity_per_word(sent)); + add(candidates, sent); seen_.clear(); step(sent, candidates, 0); @@ -49,10 +49,12 @@ std::vector> } template -void diff::add(PQ& candidates, sentence& sent) +void diff::add(PQ& candidates, const sentence& sent) { seen_.insert(sent.to_string()); - candidates.emplace(sent, lm_.perplexity_per_word(sent)); + auto score = lambda_ * lm_.perplexity_per_word(sent) + + (1.0 - lambda_) * sent.average_weight(); + candidates.emplace(sent, score); if (candidates.size() > max_cand_size_) candidates.pop(); } diff --git a/src/lm/tools/lm-test.cpp b/src/lm/tools/lm-test.cpp index 9087aff8e..792945be9 100644 --- a/src/lm/tools/lm-test.cpp +++ b/src/lm/tools/lm-test.cpp @@ -32,7 +32,7 @@ int main(int argc, char* argv[]) std::cout << (i + 1) << "." 
<< std::endl; std::cout << " Sentence: " << candidates[i].first.to_string() << std::endl; - std::cout << " PPW: " << candidates[i].second << std::endl; + std::cout << " Score: " << candidates[i].second << std::endl; std::cout << " Edits:" << std::endl; for(auto& e: candidates[i].first.operations()) std::cout << " " << e << std::endl; From 98824db6f73d56fd1d029ba73a6783dbac61e46c Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 11 Nov 2014 12:22:56 -0600 Subject: [PATCH 023/481] prevent divide by zero if no edits to sentence --- src/lm/sentence.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp index c890313b6..bc6c15ee8 100644 --- a/src/lm/sentence.cpp +++ b/src/lm/sentence.cpp @@ -87,6 +87,8 @@ void sentence::insert(size_type idx, const std::string& token, double sentence::average_weight() const { + if (weights_.empty()) + return 0.0; double sum = std::accumulate(weights_.begin(), weights_.end(), 0.0); return sum / weights_.size(); } From f0ad8d1a0c03fb5caa530467387168f05997d060 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 11 Nov 2014 20:14:59 -0600 Subject: [PATCH 024/481] make constructor arguments to lm and diff be cpptoml objects --- include/corpus/corpus.h | 4 +++- include/lm/diff.h | 7 +++--- include/lm/language_model.h | 15 +++++++------ src/corpus/corpus.cpp | 4 ++++ src/lm/diff.cpp | 15 +++++-------- src/lm/language_model.cpp | 39 +++++++++++++++++---------------- src/lm/tools/create-dataset.cpp | 29 ++++++++++++++++++------ src/lm/tools/lm-test.cpp | 3 ++- 8 files changed, 69 insertions(+), 47 deletions(-) diff --git a/include/corpus/corpus.h b/include/corpus/corpus.h index 5bdb952ab..87d755f41 100644 --- a/include/corpus/corpus.h +++ b/include/corpus/corpus.h @@ -12,7 +12,7 @@ #include #include - +#include "cpptoml.h" #include "meta.h" #include "corpus/document.h" @@ -65,6 +65,8 @@ class corpus */ static std::unique_ptr load(const std::string& config_file); + static std::unique_ptr load(const cpptoml::toml_group& config); + /** * Basic exception for corpus interactions. */ diff --git a/include/lm/diff.h b/include/lm/diff.h index 78f16edfc..cce495274 100644 --- a/include/lm/diff.h +++ b/include/lm/diff.h @@ -12,6 +12,7 @@ #include #include +#include "cpptoml.h" #include "lm/language_model.h" namespace meta @@ -24,7 +25,7 @@ class diff /** * @param config_file The file containing configuration information */ - diff(const std::string& config_file, + diff(const cpptoml::toml_group& config, uint64_t max_depth = default_max_depth_); /** @@ -39,12 +40,12 @@ class diff /** * @param config_file The file containing configuration information */ - void set_stems(const std::string& config_file); + void set_stems(const cpptoml::toml_group& config); /** * @param config_file The file containing configuration information */ - void set_function_words(const std::string& config_file); + void set_function_words(const cpptoml::toml_group& config); /** * @param sent diff --git a/include/lm/language_model.h b/include/lm/language_model.h index 1ab16c061..e2f0c0145 100644 --- a/include/lm/language_model.h +++ b/include/lm/language_model.h @@ -14,6 +14,7 @@ #include #include #include +#include "cpptoml.h" #include "lm/sentence.h" namespace meta @@ -27,14 +28,14 @@ class language_model * Creates an N-gram language model based on the corpus specified in the * config file. 
*/ - language_model(const std::string& config_file); + language_model(const cpptoml::toml_group& config); /** * Creates an N-gram language model based on the corpus specified in the * config file. * @param n The value of n, which overrides any setting in the config file */ - language_model(const std::string& config_file, size_t n); + language_model(const cpptoml::toml_group& config, size_t n); /** * Randomly generates one token sequence based on and symbols. @@ -82,15 +83,15 @@ class language_model private: /** * Builds the probabilities associated with this language model. - * @param config_file The config file that specifies the location of the + * @param config The config file that specifies the location of the * corpus */ - void learn_model(const std::string& config_file); + void learn_model(const cpptoml::toml_group& config); /** - * @param config_file + * @param config */ - void select_method(const std::string& config_file); + void select_method(const cpptoml::toml_group& config); /** * @param prefix Path to where the counts files are stored @@ -98,7 +99,7 @@ class language_model void read_precomputed(const std::string& prefix); /// The language_model used to interpolate with this one for smoothing - std::unique_ptr interp_; + std::shared_ptr interp_; // shared to allow copying /// Contains the N-gram distribution probabilities (N-1 words -> (w, prob)) std::unordered_map> diff --git a/src/corpus/corpus.cpp b/src/corpus/corpus.cpp index fd412f04d..9cdd11ff8 100644 --- a/src/corpus/corpus.cpp +++ b/src/corpus/corpus.cpp @@ -26,7 +26,11 @@ const std::string& corpus::encoding() const std::unique_ptr corpus::load(const std::string& config_file) { auto config = cpptoml::parse_file(config_file); + return load(config); +} +std::unique_ptr corpus::load(const cpptoml::toml_group& config) +{ auto type = config.get_as("corpus-type"); if (!type) throw corpus_exception{"corpus-type missing from configuration file"}; diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index 648a53b31..552ce560a 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -8,18 +8,17 @@ #include #include #include "lm/diff.h" -#include "cpptoml.h" #include "porter2_stemmer.h" namespace meta { namespace lm { -diff::diff(const std::string& config_file, uint64_t max_depth) - : lm_{config_file}, max_depth_{max_depth} +diff::diff(const cpptoml::toml_group& config, uint64_t max_depth) + : lm_{config}, max_depth_{max_depth} { - set_stems(config_file); - set_function_words(config_file); + set_stems(config); + set_function_words(config); } std::vector> @@ -191,19 +190,17 @@ void diff::step(const sentence& sent, PQ& candidates, size_t depth) } } -void diff::set_function_words(const std::string& config_file) +void diff::set_function_words(const cpptoml::toml_group& config) { - auto config = cpptoml::parse_file(config_file); std::ifstream in{*config.get_as("function-words")}; std::string word; while (in >> word) fwords_.push_back(word); } -void diff::set_stems(const std::string& config_file) +void diff::set_stems(const cpptoml::toml_group& config) { std::unordered_set vocab; - auto config = cpptoml::parse_file(config_file); auto prefix = *config.get_as("prefix"); auto dataset = *config.get_as("dataset"); std::ifstream in{prefix + "/" + dataset + "/" + dataset + ".dat"}; diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index 7a1121b48..b3c6b292d 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -10,7 +10,6 @@ #include #include #include -#include "cpptoml.h" #include "analyzers/analyzer.h" #include 
"analyzers/tokenizers/icu_tokenizer.h" #include "analyzers/filters/lowercase_filter.h" @@ -25,9 +24,8 @@ namespace meta namespace lm { -language_model::language_model(const std::string& config_file) +language_model::language_model(const cpptoml::toml_group& config) { - auto config = cpptoml::parse_file(config_file); auto group = config.get_group("language-model"); auto nval = group->get_as("n-value"); if (!nval) @@ -37,14 +35,13 @@ language_model::language_model(const std::string& config_file) N_ = *nval; if (N_ > 1) - interp_ = make_unique(config_file, N_ - 1); + interp_ = std::make_shared(config, N_ - 1); - select_method(config_file); + select_method(config); } -void language_model::select_method(const std::string& config_file) +void language_model::select_method(const cpptoml::toml_group& config) { - auto config = cpptoml::parse_file(config_file); auto group = config.get_group("language-model"); auto format = group->get_as("format"); if (!format) @@ -60,24 +57,25 @@ void language_model::select_method(const std::string& config_file) read_precomputed(*prefix); } else if (*format == "learn") - learn_model(config_file); + learn_model(config); else throw language_model_exception{ "language-model format could not be determined"}; } -language_model::language_model(const std::string& config_file, size_t n) : N_{n} +language_model::language_model(const cpptoml::toml_group& config, size_t n) + : N_{n} { if (N_ > 1) - interp_ = make_unique(config_file, N_ - 1); + interp_ = std::make_shared(config, N_ - 1); - select_method(config_file); + select_method(config); } -void language_model::learn_model(const std::string& config_file) +void language_model::learn_model(const cpptoml::toml_group& config) { std::cout << "Learning " << N_ << "-gram language model" << std::endl; - auto corpus = corpus::corpus::load(config_file); + auto corpus = corpus::corpus::load(config); using namespace analyzers; std::unique_ptr stream; @@ -267,15 +265,15 @@ double language_model::perplexity(const sentence& tokens) const { sentence ngram; for (size_t i = 1; i < N_; ++i) - // ngram.push_back(tokens[i - 1]); - ngram.push_back(""); + ngram.push_back(tokens[i - 1]); + // ngram.push_back(""); double perp = 0.0; - for (auto& token : tokens) - // for (size_t i = N_; i < tokens.size(); ++i) + // for (auto& token : tokens) + for (size_t i = N_; i < tokens.size(); ++i) { - ngram.push_back(token); - // ngram.push_back(tokens[i]); + // ngram.push_back(token); + ngram.push_back(tokens[i]); perp += std::log(1.0 / prob(ngram)); ngram.pop_front(); } @@ -285,6 +283,9 @@ double language_model::perplexity(const sentence& tokens) const double language_model::perplexity_per_word(const sentence& tokens) const { + if (tokens.size() == 0) + throw language_model_exception{ + "perplexity_per_word called on empty sentence"}; return perplexity(tokens) / tokens.size(); } } diff --git a/src/lm/tools/create-dataset.cpp b/src/lm/tools/create-dataset.cpp index ce60bd754..70da20e2d 100644 --- a/src/lm/tools/create-dataset.cpp +++ b/src/lm/tools/create-dataset.cpp @@ -6,6 +6,7 @@ #include #include #include "meta.h" +#include "cpptoml.h" #include "lm/diff.h" #include "lm/sentence.h" @@ -13,18 +14,32 @@ using namespace meta; int main(int argc, char* argv[]) { - lm::diff correcter{argv[1]}; + lm::diff correcter{cpptoml::parse_file(argv[1])}; std::string line; std::ifstream in{argv[2]}; + std::ofstream out{"edits.dat"}; while (in) { std::getline(in, line); - if(line.empty()) + if (line.empty()) continue; - lm::sentence sent{line}; - auto candidates = 
correcter.candidates(sent, false); - std::cout << candidates[0].first.to_string() << std::endl; - for (auto& e : candidates[0].first.operations()) - std::cout << " " << e << std::endl; + try + { + lm::sentence sent{line}; + auto candidates = correcter.candidates(sent, true); + auto edits = candidates[0].first.operations(); + if (edits.empty()) + out << "unmodified" << std::endl; + else + { + for (auto& e : edits) + out << e << " "; + out << std::endl; + } + } + catch (lm::sentence_exception& ex) + { + out << "error" << std::endl; + } } } diff --git a/src/lm/tools/lm-test.cpp b/src/lm/tools/lm-test.cpp index 792945be9..fbde6f3c4 100644 --- a/src/lm/tools/lm-test.cpp +++ b/src/lm/tools/lm-test.cpp @@ -4,6 +4,7 @@ */ #include +#include "cpptoml.h" #include "meta.h" #include "lm/diff.h" #include "lm/sentence.h" @@ -12,7 +13,7 @@ using namespace meta; int main(int argc, char* argv[]) { - lm::diff correcter{argv[1]}; + lm::diff correcter{cpptoml::parse_file(argv[1])}; std::string line; while (true) { From 981a84a404b5c5fd4e94851c6698ace53047c2b6 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 11 Nov 2014 20:15:18 -0600 Subject: [PATCH 025/481] sentence edge case where no edits --- src/lm/sentence.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp index bc6c15ee8..b47d72976 100644 --- a/src/lm/sentence.cpp +++ b/src/lm/sentence.cpp @@ -3,6 +3,8 @@ * @author Sean Massung */ +#include + #include #include "lm/sentence.h" #include "analyzers/analyzer.h" @@ -27,9 +29,15 @@ sentence::sentence(const std::string& text) while (*stream) tokens_.push_back(stream->next()); + if (tokens_.empty()) + throw sentence_exception{"empty token stream"}; + // remove sentence markers (they're inserted by the LM) tokens_.pop_front(); tokens_.pop_back(); + + if (tokens_.empty()) + throw sentence_exception{"empty token stream"}; } std::string sentence::to_string() const From 4694bf54f50f489a982a44e0daf40319cd4b0c80 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 11 Nov 2014 20:15:41 -0600 Subject: [PATCH 026/481] skeleton for diff analyzer --- include/analyzers/all.h | 1 + include/analyzers/diff_analyzer.h | 63 ++++++++++++++++++++++++++++++ src/analyzers/CMakeLists.txt | 4 +- src/analyzers/analyzer_factory.cpp | 1 + src/analyzers/diff_analyzer.cpp | 57 +++++++++++++++++++++++++++ 5 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 include/analyzers/diff_analyzer.h create mode 100644 src/analyzers/diff_analyzer.cpp diff --git a/include/analyzers/all.h b/include/analyzers/all.h index d365a8674..7f4bfc689 100644 --- a/include/analyzers/all.h +++ b/include/analyzers/all.h @@ -1,5 +1,6 @@ #include "analyzers/analyzer.h" #include "analyzers/multi_analyzer.h" +#include "analyzers/diff_analyzer.h" #include "analyzers/libsvm_analyzer.h" diff --git a/include/analyzers/diff_analyzer.h b/include/analyzers/diff_analyzer.h new file mode 100644 index 000000000..e972aff40 --- /dev/null +++ b/include/analyzers/diff_analyzer.h @@ -0,0 +1,63 @@ +/** + * @file diff_analyzer.h + * @author Sean Massung + * + * All files in META are released under the MIT license. For more details, + * consult the file LICENSE in the root of the project. + */ + +#ifndef META_DIFF_ANALYZER_H_ +#define META_DIFF_ANALYZER_H_ + +#include "cpptoml.h" +#include "lm/diff.h" +#include "analyzers/analyzer_factory.h" +#include "analyzers/analyzer.h" +#include "util/clonable.h" + +namespace meta +{ +namespace analyzers +{ + +/** + * Analyzes documents using their tokenized words. 
+ */ +class diff_analyzer : public util::clonable +{ + public: + diff_analyzer(const cpptoml::toml_group& config, + std::unique_ptr stream); + + /** + * Copy constructor. + * @param other The other diff_analyzer to copy from + */ + diff_analyzer(const diff_analyzer& other); + + /** + * Tokenizes a file into a document. + * @param doc The document to store the tokenized information in + */ + virtual void tokenize(corpus::document& doc) override; + + /// Identifier for this analyzer. + const static std::string id; + + private: + /// The token stream to be used for extracting tokens + std::unique_ptr stream_; + + lm::diff diff_; +}; + +/** + * Specialization of the factory method for creating diff_analyzers. + */ +template <> +std::unique_ptr make_analyzer( + const cpptoml::toml_group&, + const cpptoml::toml_group&); +} +} +#endif diff --git a/src/analyzers/CMakeLists.txt b/src/analyzers/CMakeLists.txt index 4a9fc6e43..925d44e40 100644 --- a/src/analyzers/CMakeLists.txt +++ b/src/analyzers/CMakeLists.txt @@ -8,9 +8,11 @@ add_subdirectory(tree) add_library(meta-analyzers analyzer.cpp analyzer_factory.cpp libsvm_analyzer.cpp + diff_analyzer.cpp multi_analyzer.cpp) target_link_libraries(meta-analyzers meta-corpus meta-filters meta-ngram-analyzers meta-tokenizers - meta-tree-analyzers) + meta-tree-analyzers + meta-language-model) diff --git a/src/analyzers/analyzer_factory.cpp b/src/analyzers/analyzer_factory.cpp index e3b5f622e..d1640e950 100644 --- a/src/analyzers/analyzer_factory.cpp +++ b/src/analyzers/analyzer_factory.cpp @@ -30,6 +30,7 @@ analyzer_factory::analyzer_factory() register_analyzer(); register_analyzer(); register_analyzer(); + register_analyzer(); } } } diff --git a/src/analyzers/diff_analyzer.cpp b/src/analyzers/diff_analyzer.cpp new file mode 100644 index 000000000..a7e85acb9 --- /dev/null +++ b/src/analyzers/diff_analyzer.cpp @@ -0,0 +1,57 @@ +/** + * @file diff_analyzer.cpp + * @author Sean Massung + */ + +#include +#include + +#include "corpus/document.h" +#include "analyzers/diff_analyzer.h" +#include "analyzers/token_stream.h" + +namespace meta +{ +namespace analyzers +{ + +const std::string diff_analyzer::id = "diff"; + +diff_analyzer::diff_analyzer(const cpptoml::toml_group& config, + std::unique_ptr stream) + : stream_{std::move(stream)}, diff_{config} +{ + // nothing +} + +diff_analyzer::diff_analyzer(const diff_analyzer& other) + : stream_{other.stream_->clone()}, diff_{other.diff_} +{ + // nothing +} + +void diff_analyzer::tokenize(corpus::document& doc) +{ + // first, get tokens + stream_->set_content(get_content(doc)); + std::vector tokens; + while (*stream_) + tokens.push_back(stream_->next()); + + doc.increment(tokens[0], 1); +} + +template <> +std::unique_ptr + make_analyzer(const cpptoml::toml_group& global, + const cpptoml::toml_group& config) +{ + auto filts = analyzer::load_filters(global, config); + auto diff_config = global.get_group("diff-config"); + if (!diff_config) + throw analyzer::analyzer_exception{ + "diff-config section needed for diff analyzer"}; + return make_unique(*diff_config, std::move(filts)); +} +} +} From 7caf691ffbe0233ff9a187292586edde45e5cb0d Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Wed, 12 Nov 2014 15:39:42 -0600 Subject: [PATCH 027/481] complete diff_analyzer and parameterize diff options --- include/lm/diff.h | 17 ++++++++++------- src/analyzers/diff_analyzer.cpp | 33 ++++++++++++++++++++++++++++++--- src/lm/diff.cpp | 15 ++++++++++++--- src/lm/sentence.cpp | 11 +++++++---- 4 files changed, 59 insertions(+), 17 
deletions(-) diff --git a/include/lm/diff.h b/include/lm/diff.h index cce495274..2766effb6 100644 --- a/include/lm/diff.h +++ b/include/lm/diff.h @@ -25,8 +25,7 @@ class diff /** * @param config_file The file containing configuration information */ - diff(const cpptoml::toml_group& config, - uint64_t max_depth = default_max_depth_); + diff(const cpptoml::toml_group& config); /** * @param sent The sentence to transform @@ -100,18 +99,22 @@ class diff void add(PQ& candidates, const sentence& sent); language_model lm_; - std::vector fwords_; + uint64_t n_val_; + uint64_t max_edits_; std::unordered_map> stems_; + std::vector fwords_; std::unordered_set seen_; - uint64_t max_depth_; - bool use_lm_; - static constexpr uint64_t default_max_depth_ = 3; - static constexpr uint64_t n_val_ = 3; static constexpr uint64_t max_cand_size_ = 100; + bool use_lm_; /// balance between perplexity and edit weights static constexpr double lambda_ = 0.5; }; + +class diff_exception : public std::runtime_error +{ + using std::runtime_error::runtime_error; +}; } } diff --git a/src/analyzers/diff_analyzer.cpp b/src/analyzers/diff_analyzer.cpp index a7e85acb9..109cf7a02 100644 --- a/src/analyzers/diff_analyzer.cpp +++ b/src/analyzers/diff_analyzer.cpp @@ -34,11 +34,38 @@ void diff_analyzer::tokenize(corpus::document& doc) { // first, get tokens stream_->set_content(get_content(doc)); - std::vector tokens; + std::vector sentences; + std::string buffer{""}; + while (*stream_) - tokens.push_back(stream_->next()); + { + auto next = stream_->next(); + buffer += next + " "; + if (next == "") + sentences.emplace_back(std::move(buffer)); + } + + for(auto& s: sentences) + { + try + { + lm::sentence sent{s}; + auto candidates = diff_.candidates(sent, true); + auto edits = candidates[0].first.operations(); + if (edits.empty()) + doc.increment("unmodified", 1); + else + { + for (auto& e : edits) + doc.increment(e, 1); + } + } + catch (lm::sentence_exception& ex) + { + doc.increment("error", 1); + } - doc.increment(tokens[0], 1); + } } template <> diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index 552ce560a..1ce3328e2 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -14,9 +14,18 @@ namespace meta { namespace lm { -diff::diff(const cpptoml::toml_group& config, uint64_t max_depth) - : lm_{config}, max_depth_{max_depth} +diff::diff(const cpptoml::toml_group& config) : lm_{config} { + auto nval = config.get_as("n-value"); + if (!nval) + throw diff_exception{"n-value not specified in config"}; + n_val_ = *nval; + + auto edits = config.get_as("max-edits"); + if (!edits) + throw diff_exception{"max-edits not specified in config"}; + max_edits_ = *edits; + set_stems(config); set_function_words(config); } @@ -174,7 +183,7 @@ void diff::substitute(const sentence& sent, size_t idx, PQ& candidates, template void diff::step(const sentence& sent, PQ& candidates, size_t depth) { - if (depth == max_depth_) + if (depth == max_edits_) return; if (use_lm_) diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp index b47d72976..61ad35bc0 100644 --- a/src/lm/sentence.cpp +++ b/src/lm/sentence.cpp @@ -72,15 +72,17 @@ sentence sentence::operator()(size_type from, size_type to) const void sentence::substitute(size_type idx, const std::string& token, double weight /* = 0.0 */) { - ops_.push_back("substitute(" + std::to_string(idx) + ", " + tokens_[idx] - + " -> " + token + ")"); + //ops_.push_back("substitute(" + std::to_string(idx) + ", " + tokens_[idx] + // + " -> " + token + ")"); + ops_.push_back("substitute(" + tokens_[idx] + " -> " + token + 
")"); tokens_[idx] = token; weights_.push_back(weight); } void sentence::remove(size_type idx, double weight /* = 0.0 */) { - ops_.push_back("remove(" + std::to_string(idx) + ", " + (*this)[idx] + ")"); + //ops_.push_back("remove(" + std::to_string(idx) + ", " + (*this)[idx] + ")"); + ops_.push_back("remove(" + (*this)[idx] + ")"); tokens_.erase(tokens_.begin() + idx); weights_.push_back(weight); } @@ -89,7 +91,8 @@ void sentence::insert(size_type idx, const std::string& token, double weight /* = 0.0 */) { tokens_.insert(tokens_.begin() + idx, token); - ops_.push_back("insert(" + std::to_string(idx) + ", " + token + ")"); + //ops_.push_back("insert(" + std::to_string(idx) + ", " + token + ")"); + ops_.push_back("insert(" + token + ")"); weights_.push_back(weight); } From 0f281c92b54642f702822630b0bc3e1869fe30df Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sun, 16 Nov 2014 16:00:39 -0600 Subject: [PATCH 028/481] configure language_model back to learn the distribution --- include/lm/diff.h | 2 +- src/lm/language_model.cpp | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/lm/diff.h b/include/lm/diff.h index 2766effb6..6bae270a4 100644 --- a/include/lm/diff.h +++ b/include/lm/diff.h @@ -104,7 +104,7 @@ class diff std::unordered_map> stems_; std::vector fwords_; std::unordered_set seen_; - static constexpr uint64_t max_cand_size_ = 100; + static constexpr uint64_t max_cand_size_ = 20; bool use_lm_; /// balance between perplexity and edit weights diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index b3c6b292d..6a20cb677 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -265,15 +265,15 @@ double language_model::perplexity(const sentence& tokens) const { sentence ngram; for (size_t i = 1; i < N_; ++i) - ngram.push_back(tokens[i - 1]); - // ngram.push_back(""); + // ngram.push_back(tokens[i - 1]); + ngram.push_back(""); double perp = 0.0; - // for (auto& token : tokens) - for (size_t i = N_; i < tokens.size(); ++i) + for (auto& token : tokens) + //for (size_t i = N_; i < tokens.size(); ++i) { - // ngram.push_back(token); - ngram.push_back(tokens[i]); + ngram.push_back(token); + //ngram.push_back(tokens[i]); perp += std::log(1.0 / prob(ngram)); ngram.pop_front(); } From da48e58a2e57fd397cea52a5c8b50bfe60b32bb4 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 21 Nov 2014 09:19:16 -0600 Subject: [PATCH 029/481] fix config file reading in create-dataset for diff; add debug option --- src/lm/tools/create-dataset.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/lm/tools/create-dataset.cpp b/src/lm/tools/create-dataset.cpp index 70da20e2d..57088c94b 100644 --- a/src/lm/tools/create-dataset.cpp +++ b/src/lm/tools/create-dataset.cpp @@ -14,7 +14,9 @@ using namespace meta; int main(int argc, char* argv[]) { - lm::diff correcter{cpptoml::parse_file(argv[1])}; + bool diagnostic = true; + auto config = cpptoml::parse_file(argv[1]); + lm::diff correcter{*config.get_group("diff-config")}; std::string line; std::ifstream in{argv[2]}; std::ofstream out{"edits.dat"}; @@ -25,6 +27,11 @@ int main(int argc, char* argv[]) continue; try { + if (diagnostic) + { + out << std::endl; + out << line << std::endl; + } lm::sentence sent{line}; auto candidates = correcter.candidates(sent, true); auto edits = candidates[0].first.operations(); @@ -36,6 +43,8 @@ int main(int argc, char* argv[]) out << e << " "; out << std::endl; } + if (diagnostic) + out << candidates[0].first.to_string() << std::endl; } 
catch (lm::sentence_exception& ex) { From 207a63ad306cdb58d933a96ae017a745f9e78a16 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 21 Nov 2014 23:14:03 -0600 Subject: [PATCH 030/481] removed outdated gitignore rules --- .gitignore | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.gitignore b/.gitignore index 348ee951e..2ccf2cdc3 100644 --- a/.gitignore +++ b/.gitignore @@ -7,9 +7,4 @@ doc/ .*.swp *.o *.class -*.pyc -learn -features -search -tester .* From 378122017df3ad5066615f53398809a8f8156059 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 21 Nov 2014 23:14:28 -0600 Subject: [PATCH 031/481] start work on feature selection --- include/features/feature_selector.h | 126 +++++++++++++++++++++++++ src/CMakeLists.txt | 1 + src/features/CMakeLists.txt | 6 ++ src/features/feature_selector.cpp | 48 ++++++++++ src/features/tools/CMakeLists.txt | 2 + src/features/tools/feature_summary.cpp | 40 ++++++++ 6 files changed, 223 insertions(+) create mode 100644 include/features/feature_selector.h create mode 100644 src/features/CMakeLists.txt create mode 100644 src/features/feature_selector.cpp create mode 100644 src/features/tools/CMakeLists.txt create mode 100644 src/features/tools/feature_summary.cpp diff --git a/include/features/feature_selector.h b/include/features/feature_selector.h new file mode 100644 index 000000000..c5e0e33c2 --- /dev/null +++ b/include/features/feature_selector.h @@ -0,0 +1,126 @@ +/** + * @file feature_selector.h + * @author Sean Massung + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_FEATURE_SELECTOR_H_ +#define META_FEATURE_SELECTOR_H_ + +#include +#include +#include +#include + +#include "util/disk_vector.h" +#include "index/forward_index.h" +#include "io/binary.h" + +namespace meta +{ +namespace features +{ +/** + * The base class that shows the feature selection interface for MeTA, allowing + * dimensionality reduction for documents as well as descriptions of classes by + * their useful features. + */ +class feature_selector +{ + public: + /** + * @param filename + * @param idx + */ + feature_selector(const std::string& prefix, + std::shared_ptr idx); + + /** + * Default destructor. + */ + virtual ~feature_selector() = default; + + /** + * Prints a summary of which features are representative for each class. + */ + void print_summary() const; + + protected: + /** + * Probability of term occuring in class + * \f$ P(t, c) = \frac{c(t, c)}{T} \f$ + * @param term + * @param label + * @return P(t, c) + */ + double term_and_class(term_id term, label_id label) const; + + /** + * Probability of not seeing a term and a class: + * \f$ P(t', c) = P(c) - P(t, c) \f$ + * @param term + * @param label + * @return P(t', c) + */ + double not_term_and_class(term_id term, label_id label) const; + + /** + * Probability of term not occuring in a class: + * \f$ P(t, c') = P(t) - P(t, c) \f$ + * @param term + * @param label + * @return P(t, c') + */ + double term_and_not_class(term_id term, label_id label) const; + + /** + * Probability not in class c in which term t does not occur: + * \f$ P(t', c') = 1 - P(t, c) - P(t', c) - P(t, c') \f$ + * @param term + * @param label + * @return P(t', c') + */ + double not_term_and_not_class(term_id term, label_id label) const; + + /** + * The internal implementation of a feature_selector object is a disk_vector + * and a collection of binary files. 
The disk_vector allows constant-time + * access to look up a term_id and check whether it has been "selected". The + * binary files are sorted by feature score for easy summary operations as + * well as changing which top features are set to be selected. + * + * This base feature_selector class calculates and contains four + * distributions which may be used to calculate different feature selection + * scores, implemented as derived classes. + */ + + /// Where the feature selection data is stored + const std::string prefix_; + + /// The forward_index this feature selection is being performed on + std::shared_ptr idx_; + + /// Whether or not a term_id is currently selected + util::disk_vector selected_; + + /// Binary files containing features sorted by scores, indexed by label_id + std::vector sorted_features_; + + /** The following three classes are only used during initial creation */ + + /// P(t) in the entire collection, indexed by term_id + std::vector term_prob_; + + /// P(c) in the collection, indexed by label_id + std::vector class_prob_; + + /// P(c,t) indexed by label_id and term_id + std::vector> co_occur_; +}; +} +} + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b0c6bb44f..8ae996751 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -3,6 +3,7 @@ project(meta) add_subdirectory(analyzers) add_subdirectory(classify) add_subdirectory(corpus) +add_subdirectory(features) add_subdirectory(graph) add_subdirectory(index) add_subdirectory(io) diff --git a/src/features/CMakeLists.txt b/src/features/CMakeLists.txt new file mode 100644 index 000000000..a45c01be1 --- /dev/null +++ b/src/features/CMakeLists.txt @@ -0,0 +1,6 @@ +project(meta-features) + +add_subdirectory(tools) + +add_library(meta-features feature_selector.cpp) +target_link_libraries(meta-features meta-index ${CMAKE_THREAD_LIBS_INIT}) diff --git a/src/features/feature_selector.cpp b/src/features/feature_selector.cpp new file mode 100644 index 000000000..32631dce0 --- /dev/null +++ b/src/features/feature_selector.cpp @@ -0,0 +1,48 @@ +/** + * @file feature_selector.cpp + * @author Sean Massung + */ + +#include +#include "features/feature_selector.h" + +namespace meta +{ +namespace features +{ +feature_selector::feature_selector(const std::string& prefix, + std::shared_ptr idx) + : prefix_{prefix}, + idx_{std::move(idx)}, + selected_{prefix + ".selected", idx_->unique_terms()} +{ +} + +void feature_selector::print_summary() const +{ + std::cout << "Feature summary" << std::endl; +} + +double feature_selector::term_and_class(term_id term, label_id label) const +{ + return co_occur_[label][term]; +} + +double feature_selector::not_term_and_class(term_id term, label_id label) const +{ + return 1.0 - term_and_class(term, label) - not_term_and_class(term, label) + - term_and_not_class(term, label); +} + +double feature_selector::term_and_not_class(term_id term, label_id label) const +{ + return term_prob_[term] - term_and_class(term, label); +} + +double feature_selector::not_term_and_not_class(term_id term, + label_id label) const +{ + return class_prob_[label] - term_and_class(term, label); +} +} +} diff --git a/src/features/tools/CMakeLists.txt b/src/features/tools/CMakeLists.txt new file mode 100644 index 000000000..8032b0e95 --- /dev/null +++ b/src/features/tools/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(feature-summary feature_summary.cpp) +target_link_libraries(feature-summary meta-features meta-index) diff --git a/src/features/tools/feature_summary.cpp 
b/src/features/tools/feature_summary.cpp new file mode 100644 index 000000000..f4cea9dd3 --- /dev/null +++ b/src/features/tools/feature_summary.cpp @@ -0,0 +1,40 @@ +/** + * @file feature-summary.cpp + * @author Sean Massung + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#include +#include "cpptoml.h" +#include "features/feature_selector.h" +#include "index/forward_index.h" + +using namespace meta; + +int main(int argc, char* argv[]) +{ + if (argc != 2) + { + std::cerr << "Usage:\t" << argv[0] << " config.toml" << std::endl; + return 1; + } + + logging::set_cerr_logging(); + + auto config = cpptoml::parse_file(argv[1]); + auto feature_config = config.get_group("features"); + if (!feature_config) + { + std::cerr << "Missing [features] config group" << std::endl; + return 1; + } + + auto f_idx = index::make_index(argv[1]); + auto f_prefix = feature_config->get_as("prefix"); + features::feature_selector selector{*f_prefix, f_idx}; + + selector.print_summary(); +} From 2d06c05b0f6d1a783040c7e18a6212043b617fa2 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 21 Nov 2014 23:17:39 -0600 Subject: [PATCH 032/481] update config file for [features] group --- config.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/config.toml b/config.toml index 2d5ef259e..64f37de3a 100644 --- a/config.toml +++ b/config.toml @@ -46,3 +46,6 @@ section-size = 99 train-sections = [0, 18] dev-sections = [19, 21] test-sections = [22, 24] + +[features] +prefix = "features" From 1c96f83a796343c1ba92cb23b340eee987117409 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 22 Nov 2014 12:53:39 -0600 Subject: [PATCH 033/481] work on feature_selector --- include/features/feature_selector.h | 66 +++++++++++++-- include/features/information_gain.h | 65 +++++++++++++++ src/features/feature_selector.cpp | 111 +++++++++++++++++++++++-- src/features/tools/feature_summary.cpp | 6 +- 4 files changed, 231 insertions(+), 17 deletions(-) create mode 100644 include/features/information_gain.h diff --git a/include/features/feature_selector.h b/include/features/feature_selector.h index c5e0e33c2..fbcd9ff61 100644 --- a/include/features/feature_selector.h +++ b/include/features/feature_selector.h @@ -44,11 +44,57 @@ class feature_selector virtual ~feature_selector() = default; /** - * Prints a summary of which features are representative for each class. + * Prints a summary of the top k features for each class. + * @param k */ - void print_summary() const; + virtual void print_summary(uint64_t k = 20) const; + + /** + * @param term + * @return whether the given term is currently "selected" + */ + virtual bool selected(term_id term) const; + + /** + * Sets the top k features for *each class* to be "selected" + * @param k + */ + virtual void select(uint64_t k = 25); + + /** + * Selects approximately the top p percent features for the entire dataset, + * \f$ p\in[0,1] \f$. Each class will then have \f$\frac{p\cdot |V|}{ + * |L|} \f$ features selected, where \f$|L|\f$ is the number of classes. + * @param p + */ + virtual void select_percent(double p = 0.05); protected: + /** + * Creates the state of this feature_selector if necessary; this logic is + * outside the constructor since it requires pure virtual functions + * implemented by deriving classes. 
+ */
+ void init();
+
+ /**
+ * Scores a (label, term) pair in the index according to the derived class's
+ * feature selection method
+ * @param lid
+ * @param tid
+ */
+ virtual double score(label_id lid, term_id tid) const = 0;
+
+ /**
+ * @return the probability of a specific term in the index
+ */
+ double prob_term(term_id id) const;
+
+ /**
+ * @return the probability of a specific class in the index
+ */
+ double prob_class(label_id id) const;
+
 /**
 * Probability of the term and the class occurring together:
 * \f$ P(t, c) = \frac{c(t, c)}{T} \f$
 * @param term
 * @param label
 * @return P(t, c)
 */
 double term_and_class(term_id term, label_id label) const;
@@ -85,6 +131,7 @@ class feature_selector
 */
 double not_term_and_not_class(term_id term, label_id label) const;
+ private:
 /**
 * The internal implementation of a feature_selector object is a disk_vector
 * and a collection of binary files. The disk_vector allows constant-time
@@ -97,6 +144,17 @@ class feature_selector
 * scores, implemented as derived classes.
 */
+ /**
+ * Calculates the probabilities of terms and classes given the current
+ * index.
+ */
+ void calc_probs();
+
+ /**
+ * Calculates the feature score for each (label, term) pair.
+ */
+ void score_all();
+
 /// Where the feature selection data is stored
 const std::string prefix_;
@@ -109,15 +167,13 @@ class feature_selector
 /// Binary files containing features sorted by scores, indexed by label_id
 std::vector sorted_features_;
- /** The following three classes are only used during initial creation */
-
 /// P(t) in the entire collection, indexed by term_id
 std::vector term_prob_;
 /// P(c) in the collection, indexed by label_id
 std::vector class_prob_;
- /// P(c,t) indexed by label_id and term_id
+ /// P(c,t) indexed by [label_id][term_id]
 std::vector> co_occur_;
};
}
}
diff --git a/include/features/information_gain.h b/include/features/information_gain.h
new file mode 100644
index 000000000..d405092d8
--- /dev/null
+++ b/include/features/information_gain.h
@@ -0,0 +1,65 @@
+/**
+ * @file information_gain.h
+ * @author Sean Massung
+ *
+ * All files in META are dual-licensed under the MIT and NCSA licenses. For more
+ * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the
+ * project.
+ */
+
+#ifndef META_INFORMATION_GAIN_H_
+#define META_INFORMATION_GAIN_H_
+
+#include "features/feature_selector.h"
+
+namespace meta
+{
+namespace features
+{
+/**
+ * Performs information gain feature selection:
+ * \f$ IG(t, c_i) =
+ * \sum_{c\in\{c_i, \overline{c_i}\}} \sum_{t'\in\{t,\overline{t}\}}P(t',c) \log
+ * \frac{P(t',c)}{P(t')P(c)} \f$
+ */
+class information_gain : public feature_selector
+{
+ public:
+ /**
+ * Constructor.
+ */
+ information_gain(const std::string& prefix,
+ std::shared_ptr idx)
+ : feature_selector{prefix, std::move(idx)}
+ {
+ init();
+ }
+
+ /**
+ * Scores the (label, term) pair according to this feature selection
+ * metric.
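+ * Written out, the sum above is the four-term expression
+ * \f$ P(t,c_i)\log\frac{P(t,c_i)}{P(t)P(c_i)}
+ * + P(\overline{t},c_i)\log\frac{P(\overline{t},c_i)}{P(\overline{t})P(c_i)}
+ * + P(t,\overline{c_i})\log\frac{P(t,\overline{c_i})}{P(t)P(\overline{c_i})}
+ * + P(\overline{t},\overline{c_i})\log\frac{P(\overline{t},\overline{c_i})}{P(\overline{t})P(\overline{c_i})} \f$,
+ * which is exactly the set of four gain terms computed below.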
+ * @param lid + * @param tid + */ + virtual double score(label_id lid, term_id tid) const override + { + double p_tc = term_and_class(tid, lid); + double p_ntnc = not_term_and_not_class(tid, lid); + double p_ntc = not_term_and_class(tid, lid); + double p_tnc = term_and_not_class(tid, lid); + double p_c = prob_class(lid); + double p_t = prob_term(tid); + double p_nc = 1.0 - p_c; + double p_nt = 1.0 - p_t; + + double gain_tc = p_tc * std::log(p_tc / (p_t * p_c)); + double gain_ntnc = p_ntnc * std::log(p_ntnc / (p_nt * p_nc)); + double gain_ntc = p_ntc * std::log(p_ntc / (p_nt * p_c)); + double gain_tnc = p_tnc * std::log(p_tnc / (p_t * p_nc)); + + return gain_tc + gain_ntnc + gain_ntc + gain_tnc; + } +}; +} +} +#endif diff --git a/src/features/feature_selector.cpp b/src/features/feature_selector.cpp index 32631dce0..49e4a13c4 100644 --- a/src/features/feature_selector.cpp +++ b/src/features/feature_selector.cpp @@ -4,7 +4,10 @@ */ #include +#include "util/filesystem.h" +#include "util/progress.h" #include "features/feature_selector.h" +#include "index/postings_data.h" namespace meta { @@ -14,21 +17,112 @@ feature_selector::feature_selector(const std::string& prefix, std::shared_ptr idx) : prefix_{prefix}, idx_{std::move(idx)}, - selected_{prefix + ".selected", idx_->unique_terms()} + selected_{prefix_ + ".selected", idx_->unique_terms()} +{ /* nothing */ +} + +void feature_selector::init() +{ + // if the first class distribution doesn't exist, we haven't created the + // data for this feature_selector yet + if (!filesystem::file_exists(prefix_ + ".0")) + { + // initially set all probabilities to zero; this allows fast random + // access to the probabilities + term_prob_.assign(idx_->unique_terms(), 0.0); + class_prob_.assign(idx_->num_labels(), 0.0); + co_occur_.assign(idx_->num_labels(), + std::vector(idx_->unique_terms(), 0.0)); + calc_probs(); + score_all(); + } + else + { + std::cout << "Loading feature_selector not implemented yet" + << std::endl; + } +} + +void feature_selector::score_all() +{ + printing::progress prog{" > Selecting features: ", term_prob_.size()}; + for (uint64_t tid = 0; tid < term_prob_.size(); ++tid) + { + prog(tid); + for (uint64_t lbl = 0; lbl < class_prob_.size(); ++lbl) + score(label_id{lbl + 1}, term_id{tid}); + } + prog.end(); +} + +void feature_selector::select(uint64_t k /* = 25 */) +{ +} + +bool feature_selector::selected(term_id term) const +{ + return false; +} + +void feature_selector::select_percent(double p /* = 0.05 */) +{ + double num_features = p * idx_->unique_terms(); + uint64_t per_class = num_features / idx_->num_labels(); // truncate to int + select(per_class); +} + +void feature_selector::calc_probs() +{ + printing::progress prog{" > Calculating feature probs: ", + idx_->num_docs()}; + uint64_t total_terms = 0; + for (doc_id did = doc_id{0}; did < idx_->num_docs(); ++did) + { + prog(did); + auto lid = idx_->lbl_id(did); + ++class_prob_[lid - 1]; + for (auto& count : idx_->search_primary(did)->counts()) + { + term_prob_[count.first] += count.second; + co_occur_[lid - 1][count.first] += count.second; + total_terms += count.second; + } + } + prog.end(); + + for (auto& p : class_prob_) + p /= idx_->num_labels(); + + for (auto& p : term_prob_) + p /= total_terms; + + for (auto& probs : co_occur_) + for (auto& p : probs) + p /= total_terms; +} + +void feature_selector::print_summary(uint64_t k /* = 20 */) const +{ + std::cout << "Feature summary: top " << k << " features" << std::endl; +} + +double feature_selector::prob_term(term_id id) const { + 
return term_prob_.at(id); } -void feature_selector::print_summary() const +double feature_selector::prob_class(label_id id) const { - std::cout << "Feature summary" << std::endl; + return class_prob_.at(id - 1); } double feature_selector::term_and_class(term_id term, label_id label) const { - return co_occur_[label][term]; + return co_occur_.at(label - 1).at(term); } -double feature_selector::not_term_and_class(term_id term, label_id label) const +double feature_selector::not_term_and_not_class(term_id term, + label_id label) const { return 1.0 - term_and_class(term, label) - not_term_and_class(term, label) - term_and_not_class(term, label); @@ -36,13 +130,12 @@ double feature_selector::not_term_and_class(term_id term, label_id label) const double feature_selector::term_and_not_class(term_id term, label_id label) const { - return term_prob_[term] - term_and_class(term, label); + return term_prob_.at(term) - term_and_class(term, label); } -double feature_selector::not_term_and_not_class(term_id term, - label_id label) const +double feature_selector::not_term_and_class(term_id term, label_id label) const { - return class_prob_[label] - term_and_class(term, label); + return class_prob_.at(label - 1) - term_and_class(term, label); } } } diff --git a/src/features/tools/feature_summary.cpp b/src/features/tools/feature_summary.cpp index f4cea9dd3..872be2942 100644 --- a/src/features/tools/feature_summary.cpp +++ b/src/features/tools/feature_summary.cpp @@ -9,7 +9,8 @@ #include #include "cpptoml.h" -#include "features/feature_selector.h" +#include "util/shim.h" +#include "features/information_gain.h" #include "index/forward_index.h" using namespace meta; @@ -34,7 +35,6 @@ int main(int argc, char* argv[]) auto f_idx = index::make_index(argv[1]); auto f_prefix = feature_config->get_as("prefix"); - features::feature_selector selector{*f_prefix, f_idx}; - + features::information_gain selector{*f_prefix, f_idx}; selector.print_summary(); } From 30d38cbfc8af9b7d2fc21133991f6c221e7d6cc1 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 22 Nov 2014 14:01:26 -0600 Subject: [PATCH 034/481] selection summary --- include/features/feature_selector.h | 5 --- include/features/information_gain.h | 10 ++++++ src/features/feature_selector.cpp | 52 ++++++++++++++++++++++++++--- 3 files changed, 58 insertions(+), 9 deletions(-) diff --git a/include/features/feature_selector.h b/include/features/feature_selector.h index fbcd9ff61..9bc406731 100644 --- a/include/features/feature_selector.h +++ b/include/features/feature_selector.h @@ -11,13 +11,11 @@ #define META_FEATURE_SELECTOR_H_ #include -#include #include #include #include "util/disk_vector.h" #include "index/forward_index.h" -#include "io/binary.h" namespace meta { @@ -164,9 +162,6 @@ class feature_selector /// Whether or not a term_id is currently selected util::disk_vector selected_; - /// Binary files containing features sorted by scores, indexed by label_id - std::vector sorted_features_; - /// P(t) in the entire collection, indexed by term_id std::vector term_prob_; diff --git a/include/features/information_gain.h b/include/features/information_gain.h index d405092d8..cc759336c 100644 --- a/include/features/information_gain.h +++ b/include/features/information_gain.h @@ -57,6 +57,16 @@ class information_gain : public feature_selector double gain_ntc = p_ntc * std::log(p_ntc / (p_nt * p_c)); double gain_tnc = p_tnc * std::log(p_tnc / (p_t * p_nc)); + // if any denominators were zero, make the expression zero + if (std::isnan(gain_tc)) + gain_tc = 0.0; + 
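// (each NaN here is a 0 * log(0) case: a zero denominator can only occur
+ // together with a zero joint probability, so zeroing these terms follows
+ // the usual 0 log 0 = 0 convention)
+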
if (std::isnan(gain_ntnc)) + gain_ntnc = 0.0; + if (std::isnan(gain_ntc)) + gain_ntc = 0.0; + if (std::isnan(gain_tnc)) + gain_tnc = 0.0; + return gain_tc + gain_ntnc + gain_ntc + gain_tnc; } }; diff --git a/src/features/feature_selector.cpp b/src/features/feature_selector.cpp index 49e4a13c4..d8648318a 100644 --- a/src/features/feature_selector.cpp +++ b/src/features/feature_selector.cpp @@ -4,10 +4,13 @@ */ #include +#include #include "util/filesystem.h" #include "util/progress.h" +#include "parallel/parallel_for.h" #include "features/feature_selector.h" #include "index/postings_data.h" +#include "io/binary.h" namespace meta { @@ -45,14 +48,38 @@ void feature_selector::init() void feature_selector::score_all() { + using pair_t = std::pair; + std::vector> scores( + class_prob_.size(), std::vector(term_prob_.size())); + printing::progress prog{" > Selecting features: ", term_prob_.size()}; for (uint64_t tid = 0; tid < term_prob_.size(); ++tid) { prog(tid); for (uint64_t lbl = 0; lbl < class_prob_.size(); ++lbl) - score(label_id{lbl + 1}, term_id{tid}); + scores[lbl][tid] + = std::make_pair(tid, score(label_id{lbl + 1}, term_id{tid})); } prog.end(); + + parallel::parallel_for(scores.begin(), scores.end(), [&](auto& v) + { + std::sort(v.begin(), v.end(), [&](const auto& a, const auto& b) + { + return a.second < b.second; + }); + }); + + for (uint64_t lbl = 0; lbl < class_prob_.size(); ++lbl) + { + // write (term_id, score) pairs + std::ofstream out{prefix_ + "." + std::to_string(lbl + 1)}; + for (auto& score : scores[lbl]) + { + io::write_binary(out, score.first); + io::write_binary(out, score.second); + } + } } void feature_selector::select(uint64_t k /* = 25 */) @@ -73,8 +100,7 @@ void feature_selector::select_percent(double p /* = 0.05 */) void feature_selector::calc_probs() { - printing::progress prog{" > Calculating feature probs: ", - idx_->num_docs()}; + printing::progress prog{" > Calculating feature probs: ", idx_->num_docs()}; uint64_t total_terms = 0; for (doc_id did = doc_id{0}; did < idx_->num_docs(); ++did) { @@ -103,7 +129,25 @@ void feature_selector::calc_probs() void feature_selector::print_summary(uint64_t k /* = 20 */) const { - std::cout << "Feature summary: top " << k << " features" << std::endl; + term_id tid; + double score; + for (uint64_t lbl = 0; lbl < class_prob_.size(); ++lbl) + { + std::cout << std::endl << "Top " << k << " features for \"" + << idx_->class_label_from_id(label_id{lbl + 1}) + << "\":" << std::endl + << "===============================" << std::endl; + + // read (term_id, score) pairs + std::ifstream in{prefix_ + "." + std::to_string(lbl + 1)}; + for (uint64_t i = 0; i < k; ++i) + { + io::read_binary(in, tid); + io::read_binary(in, score); + std::cout << (i + 1) << ". 
" << idx_->term_text(tid) << " (" + << score << ")" << std::endl; + } + } } double feature_selector::prob_term(term_id id) const From 36076bde43ea5be2db57bd671a213c3b96f21a50 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 25 Nov 2014 10:25:50 -0600 Subject: [PATCH 035/481] work on feature_selector; add chi_square --- include/features/all.h | 2 + include/features/chi_square.h | 63 ++++++++++++++++++++++++++ include/features/feature_selector.h | 5 +- include/features/information_gain.h | 7 +-- src/features/feature_selector.cpp | 43 ++++++++++++------ src/features/tools/feature_summary.cpp | 6 ++- 6 files changed, 106 insertions(+), 20 deletions(-) create mode 100644 include/features/all.h create mode 100644 include/features/chi_square.h diff --git a/include/features/all.h b/include/features/all.h new file mode 100644 index 000000000..0f121b77b --- /dev/null +++ b/include/features/all.h @@ -0,0 +1,2 @@ +#include "features/chi_square.h" +#include "features/information_gain.h" diff --git a/include/features/chi_square.h b/include/features/chi_square.h new file mode 100644 index 000000000..950c47410 --- /dev/null +++ b/include/features/chi_square.h @@ -0,0 +1,63 @@ +/** + * @file chi_square.h + * @author Sean Massung + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_CHI_SQUARE_H_ +#define META_CHI_SQUARE_H_ + +#include "features/feature_selector.h" + +namespace meta +{ +namespace features +{ +/** + * Performs Chi square feature selection: + * \f$ \chi^2(t, c_i) = + * \frac{(P(t,c_i) P(\overline{t}, \overline{c_i}) - P(t, \overline{c_i}) + * P(\overline{t},c_i))^2} + * {P(t) P(\overline{t}) P(c_i) P(\overline{c_i})} \f$ + */ +class chi_square : public feature_selector +{ + public: + /** + * Constructor. + */ + chi_square(const std::string& prefix, + std::shared_ptr idx, + uint64_t features_per_class = 20) + : feature_selector{prefix + ".chi", std::move(idx)} + { + init(features_per_class); + } + + /** + * Scores the (label_id, term) pair according to this feature selection + * metric. + * @param lid + * @param tid + */ + virtual double score(label_id lid, term_id tid) const override + { + double p_tc = term_and_class(tid, lid); + double p_ntnc = not_term_and_not_class(tid, lid); + double p_ntc = not_term_and_class(tid, lid); + double p_tnc = term_and_not_class(tid, lid); + double p_c = prob_class(lid); + double p_t = prob_term(tid); + + double numerator = p_tc * p_ntnc - p_ntc * p_tnc; + double denominator = p_c * (1.0 - p_c) * p_t * (1.0 - p_t); + + return (numerator * numerator) / denominator; + } +}; +} +} +#endif diff --git a/include/features/feature_selector.h b/include/features/feature_selector.h index 9bc406731..94f5e7605 100644 --- a/include/features/feature_selector.h +++ b/include/features/feature_selector.h @@ -57,7 +57,7 @@ class feature_selector * Sets the top k features for *each class* to be "selected" * @param k */ - virtual void select(uint64_t k = 25); + virtual void select(uint64_t k = 20); /** * Selects approximately the top p percent features for the entire dataset, @@ -72,8 +72,9 @@ class feature_selector * Creates the state of this feature_selector if necessary; this logic is * outside the constructor since it requires pure virtual functions * implemented by deriving classes. 
+ * @param features_per_class */ - void init(); + void init(uint64_t features_per_class); /** * Scores a (label, term) pair in the index according to the derived class's diff --git a/include/features/information_gain.h b/include/features/information_gain.h index cc759336c..b2d72ff21 100644 --- a/include/features/information_gain.h +++ b/include/features/information_gain.h @@ -29,10 +29,11 @@ class information_gain : public feature_selector * Constructor. */ information_gain(const std::string& prefix, - std::shared_ptr idx) - : feature_selector{prefix, std::move(idx)} + std::shared_ptr idx, + uint64_t features_per_class = 20) + : feature_selector{prefix + ".ig", std::move(idx)} { - init(); + init(features_per_class); } /** diff --git a/src/features/feature_selector.cpp b/src/features/feature_selector.cpp index d8648318a..d7bde533f 100644 --- a/src/features/feature_selector.cpp +++ b/src/features/feature_selector.cpp @@ -24,11 +24,11 @@ feature_selector::feature_selector(const std::string& prefix, { /* nothing */ } -void feature_selector::init() +void feature_selector::init(uint64_t features_per_class) { // if the first class distribution doesn't exist, we haven't created the // data for this feature_selector yet - if (!filesystem::file_exists(prefix_ + ".0")) + if (!filesystem::file_exists(prefix_ + ".1")) { // initially set all probabilities to zero; this allows fast random // access to the probabilities @@ -38,11 +38,7 @@ void feature_selector::init() std::vector(idx_->unique_terms(), 0.0)); calc_probs(); score_all(); - } - else - { - std::cout << "Loading feature_selector not implemented yet" - << std::endl; + select(features_per_class); } } @@ -53,10 +49,10 @@ void feature_selector::score_all() class_prob_.size(), std::vector(term_prob_.size())); printing::progress prog{" > Selecting features: ", term_prob_.size()}; - for (uint64_t tid = 0; tid < term_prob_.size(); ++tid) + for (uint64_t tid = 0; tid < idx_->unique_terms(); ++tid) { prog(tid); - for (uint64_t lbl = 0; lbl < class_prob_.size(); ++lbl) + for (uint64_t lbl = 0; lbl < idx_->num_labels(); ++lbl) scores[lbl][tid] = std::make_pair(tid, score(label_id{lbl + 1}, term_id{tid})); } @@ -70,7 +66,7 @@ void feature_selector::score_all() }); }); - for (uint64_t lbl = 0; lbl < class_prob_.size(); ++lbl) + for (uint64_t lbl = 0; lbl < idx_->num_labels(); ++lbl) { // write (term_id, score) pairs std::ofstream out{prefix_ + "." + std::to_string(lbl + 1)}; @@ -82,13 +78,34 @@ void feature_selector::score_all() } } -void feature_selector::select(uint64_t k /* = 25 */) +void feature_selector::select(uint64_t features_per_class /* = 20 */) { + term_id id; + double score; + std::unordered_set terms; + for (uint64_t lbl = 0; lbl < idx_->num_labels(); ++lbl) + { + std::ifstream in{prefix_ + "." 
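+ // each per-class score file written by score_all() holds (term_id,
+ // score) pairs; the first features_per_class entries read back here
+ // become that class's selected features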
+ std::to_string(lbl + 1)}; + for (uint64_t i = 0; i < features_per_class; ++i) + { + io::read_binary(in, id); + io::read_binary(in, score); + terms.insert(id); + } + } + + // zero out old vector + for (auto& b : selected_) + b = false; + + // select new features + for (auto& term : terms) + selected_[term] = true; } bool feature_selector::selected(term_id term) const { - return false; + return selected_[term]; } void feature_selector::select_percent(double p /* = 0.05 */) @@ -131,7 +148,7 @@ void feature_selector::print_summary(uint64_t k /* = 20 */) const { term_id tid; double score; - for (uint64_t lbl = 0; lbl < class_prob_.size(); ++lbl) + for (uint64_t lbl = 0; lbl < idx_->num_labels(); ++lbl) { std::cout << std::endl << "Top " << k << " features for \"" << idx_->class_label_from_id(label_id{lbl + 1}) diff --git a/src/features/tools/feature_summary.cpp b/src/features/tools/feature_summary.cpp index 872be2942..0d1348665 100644 --- a/src/features/tools/feature_summary.cpp +++ b/src/features/tools/feature_summary.cpp @@ -10,7 +10,7 @@ #include #include "cpptoml.h" #include "util/shim.h" -#include "features/information_gain.h" +#include "features/all.h" #include "index/forward_index.h" using namespace meta; @@ -35,6 +35,8 @@ int main(int argc, char* argv[]) auto f_idx = index::make_index(argv[1]); auto f_prefix = feature_config->get_as("prefix"); + //features::chi_square selector{*f_prefix, f_idx}; features::information_gain selector{*f_prefix, f_idx}; - selector.print_summary(); + //selector.select(100); + selector.print_summary(10); } From 418241161fd6923f1fa1313a3bc5e48c4860c356 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 25 Nov 2014 13:40:14 -0600 Subject: [PATCH 036/481] make_feature_selector initial version --- include/features/chi_square.h | 15 +++--- include/features/feature_selector.h | 24 ++++++++-- include/features/information_gain.h | 15 +++--- include/features/make_feature_selector.h | 58 ++++++++++++++++++++++++ src/features/tools/feature_summary.cpp | 12 +++-- 5 files changed, 99 insertions(+), 25 deletions(-) create mode 100644 include/features/make_feature_selector.h diff --git a/include/features/chi_square.h b/include/features/chi_square.h index 950c47410..a5bfc7ffc 100644 --- a/include/features/chi_square.h +++ b/include/features/chi_square.h @@ -26,16 +26,15 @@ namespace features class chi_square : public feature_selector { public: + /// Inherit constructor. + using feature_selector::feature_selector; + /** - * Constructor. 
+ * This feature_selector is a friend of the factory method used to create it */ - chi_square(const std::string& prefix, - std::shared_ptr idx, - uint64_t features_per_class = 20) - : feature_selector{prefix + ".chi", std::move(idx)} - { - init(features_per_class); - } + template + friend std::shared_ptr + make_selector(const std::string&, ForwardIndex, Args&&...); /** * Scores the (label_id, term) pair according to this feature selection diff --git a/include/features/feature_selector.h b/include/features/feature_selector.h index 94f5e7605..a0e084b07 100644 --- a/include/features/feature_selector.h +++ b/include/features/feature_selector.h @@ -14,6 +14,7 @@ #include #include +#include "features/make_feature_selector.h" #include "util/disk_vector.h" #include "index/forward_index.h" @@ -30,11 +31,11 @@ class feature_selector { public: /** - * @param filename - * @param idx + * This feature_selector is a friend of the factory method used to create it */ - feature_selector(const std::string& prefix, - std::shared_ptr idx); + template + friend std::shared_ptr + make_selector(const std::string&, ForwardIndex, Args&&...); /** * Default destructor. @@ -68,6 +69,13 @@ class feature_selector virtual void select_percent(double p = 0.05); protected: + /** + * @param config + * @param idx + */ + feature_selector(const std::string& prefix, + std::shared_ptr idx); + /** * Creates the state of this feature_selector if necessary; this logic is * outside the constructor since it requires pure virtual functions @@ -172,6 +180,14 @@ class feature_selector /// P(c,t) indexed by [label_id][term_id] std::vector> co_occur_; }; +/** + * Basic exception for feature selectors. + */ +class feature_selector_exception : public std::runtime_error +{ + public: + using std::runtime_error::runtime_error; +}; } } diff --git a/include/features/information_gain.h b/include/features/information_gain.h index b2d72ff21..0f85c8401 100644 --- a/include/features/information_gain.h +++ b/include/features/information_gain.h @@ -25,16 +25,15 @@ namespace features class information_gain : public feature_selector { public: + /// Inherit constructor. + using feature_selector::feature_selector; + /** - * Constructor. + * This feature_selector is a friend of the factory method used to create it */ - information_gain(const std::string& prefix, - std::shared_ptr idx, - uint64_t features_per_class = 20) - : feature_selector{prefix + ".ig", std::move(idx)} - { - init(features_per_class); - } + template + friend std::shared_ptr + make_selector(const std::string&, ForwardIndex, Args&&...); /** * Scores the (label, term) pair according to this feature selection diff --git a/include/features/make_feature_selector.h b/include/features/make_feature_selector.h new file mode 100644 index 000000000..149cba915 --- /dev/null +++ b/include/features/make_feature_selector.h @@ -0,0 +1,58 @@ +/** + * @file make_feature_selector.h + * @author Sean Massung + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_MAKE_FEATURE_SELECTOR_H_ +#define META_MAKE_FEATURE_SELECTOR_H_ + +#include + +#include "cpptoml.h" +#include "features/feature_selector.h" + +namespace meta +{ +namespace features +{ +/** + * Factory method for creating feature selection algorithms. 
+ * @param config_file The path to the configuration file to create selector + * @param fwd_idx The forward_index to perform feature selection on + * @param args any additional arguments to forward to the constructor (usually + * none) + * @return a properly initialized feature_selector + */ +template +std::shared_ptr make_selector(const std::string& config_file, + ForwardIndex fwd_idx, Args&&... args) +{ + auto config = cpptoml::parse_file(config_file); + auto group = config.get_group("features"); + if (!group) + throw std::runtime_error{"[features] group missing from config file"}; + + auto prefix = group->get_as("prefix"); + if (!prefix) + throw std::runtime_error{"no prefix in [features] group"}; + + // can't use make_shared since Selector constructors are private + auto selector = std::shared_ptr{ + new Selector(*prefix, std::move(fwd_idx), std::forward(args)...)}; + + uint64_t features_per_class = 20; + auto num_features = group->get_as("features-per-class"); + if (num_features) + features_per_class = *num_features; + + selector->init(features_per_class); + return selector; +} +} +} + +#endif diff --git a/src/features/tools/feature_summary.cpp b/src/features/tools/feature_summary.cpp index 0d1348665..19395a7ee 100644 --- a/src/features/tools/feature_summary.cpp +++ b/src/features/tools/feature_summary.cpp @@ -11,6 +11,7 @@ #include "cpptoml.h" #include "util/shim.h" #include "features/all.h" +#include "features/make_feature_selector.h" #include "index/forward_index.h" using namespace meta; @@ -34,9 +35,10 @@ int main(int argc, char* argv[]) } auto f_idx = index::make_index(argv[1]); - auto f_prefix = feature_config->get_as("prefix"); - //features::chi_square selector{*f_prefix, f_idx}; - features::information_gain selector{*f_prefix, f_idx}; - //selector.select(100); - selector.print_summary(10); + auto selector + = features::make_selector(argv[1], f_idx); + // auto selector = features::make_selector{argv[1], + // f_idx}; + selector->select(100); + selector->print_summary(10); } From dd1f9e0bde1fd82597bd7f9b90169a270331621c Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 25 Nov 2014 15:06:21 -0600 Subject: [PATCH 037/481] fix new compiler warning --- src/features/feature_selector.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/features/feature_selector.cpp b/src/features/feature_selector.cpp index d7bde533f..8de0fdb86 100644 --- a/src/features/feature_selector.cpp +++ b/src/features/feature_selector.cpp @@ -53,8 +53,8 @@ void feature_selector::score_all() { prog(tid); for (uint64_t lbl = 0; lbl < idx_->num_labels(); ++lbl) - scores[lbl][tid] - = std::make_pair(tid, score(label_id{lbl + 1}, term_id{tid})); + scores[lbl][tid] = std::make_pair( + tid, score(static_cast(lbl + 1), term_id{tid})); } prog.end(); @@ -66,6 +66,7 @@ void feature_selector::score_all() }); }); + for (uint64_t lbl = 0; lbl < idx_->num_labels(); ++lbl) { // write (term_id, score) pairs @@ -151,7 +152,7 @@ void feature_selector::print_summary(uint64_t k /* = 20 */) const for (uint64_t lbl = 0; lbl < idx_->num_labels(); ++lbl) { std::cout << std::endl << "Top " << k << " features for \"" - << idx_->class_label_from_id(label_id{lbl + 1}) + << idx_->class_label_from_id(static_cast(lbl + 1)) << "\":" << std::endl << "===============================" << std::endl; From 79867f407febc4b19f6a25cdf832263cc51cd1b5 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 29 Nov 2014 13:47:39 -0600 Subject: [PATCH 038/481] create factory for feature selectors downside: constructors are now 
public --- include/features/chi_square.h | 23 +---- include/features/feature_selector.h | 31 +++---- include/features/information_gain.h | 37 +------- include/features/make_feature_selector.h | 58 ------------- include/features/selector_factory.h | 105 +++++++++++++++++++++++ src/features/CMakeLists.txt | 5 +- src/features/chi_square.cpp | 29 +++++++ src/features/feature_selector.cpp | 4 +- src/features/information_gain.cpp | 44 ++++++++++ src/features/selector_factory.cpp | 57 ++++++++++++ src/features/tools/feature_summary.cpp | 7 +- 11 files changed, 261 insertions(+), 139 deletions(-) delete mode 100644 include/features/make_feature_selector.h create mode 100644 include/features/selector_factory.h create mode 100644 src/features/chi_square.cpp create mode 100644 src/features/information_gain.cpp create mode 100644 src/features/selector_factory.cpp diff --git a/include/features/chi_square.h b/include/features/chi_square.h index a5bfc7ffc..ff7a1d3e8 100644 --- a/include/features/chi_square.h +++ b/include/features/chi_square.h @@ -29,12 +29,8 @@ class chi_square : public feature_selector /// Inherit constructor. using feature_selector::feature_selector; - /** - * This feature_selector is a friend of the factory method used to create it - */ - template - friend std::shared_ptr - make_selector(const std::string&, ForwardIndex, Args&&...); + /// Identifier for this feature_selector. + const static std::string id; /** * Scores the (label_id, term) pair according to this feature selection @@ -42,20 +38,7 @@ class chi_square : public feature_selector * @param lid * @param tid */ - virtual double score(label_id lid, term_id tid) const override - { - double p_tc = term_and_class(tid, lid); - double p_ntnc = not_term_and_not_class(tid, lid); - double p_ntc = not_term_and_class(tid, lid); - double p_tnc = term_and_not_class(tid, lid); - double p_c = prob_class(lid); - double p_t = prob_term(tid); - - double numerator = p_tc * p_ntnc - p_ntc * p_tnc; - double denominator = p_c * (1.0 - p_c) * p_t * (1.0 - p_t); - - return (numerator * numerator) / denominator; - } + virtual double score(label_id lid, term_id tid) const override; }; } } diff --git a/include/features/feature_selector.h b/include/features/feature_selector.h index a0e084b07..480b6709a 100644 --- a/include/features/feature_selector.h +++ b/include/features/feature_selector.h @@ -31,11 +31,19 @@ class feature_selector { public: /** - * This feature_selector is a friend of the factory method used to create it + * @param config + * @param idx + */ + feature_selector(const std::string& prefix, + std::shared_ptr idx); + + /** + * Creates the state of this feature_selector if necessary; this logic is + * outside the constructor since it requires pure virtual functions + * implemented by deriving classes. + * @param features_per_class */ - template - friend std::shared_ptr - make_selector(const std::string&, ForwardIndex, Args&&...); + void init(uint64_t features_per_class); /** * Default destructor. @@ -69,21 +77,6 @@ class feature_selector virtual void select_percent(double p = 0.05); protected: - /** - * @param config - * @param idx - */ - feature_selector(const std::string& prefix, - std::shared_ptr idx); - - /** - * Creates the state of this feature_selector if necessary; this logic is - * outside the constructor since it requires pure virtual functions - * implemented by deriving classes. 
- * @param features_per_class - */ - void init(uint64_t features_per_class); - /** * Scores a (label, term) pair in the index according to the derived class's * feature selection method diff --git a/include/features/information_gain.h b/include/features/information_gain.h index 0f85c8401..a39692b10 100644 --- a/include/features/information_gain.h +++ b/include/features/information_gain.h @@ -28,12 +28,8 @@ class information_gain : public feature_selector /// Inherit constructor. using feature_selector::feature_selector; - /** - * This feature_selector is a friend of the factory method used to create it - */ - template - friend std::shared_ptr - make_selector(const std::string&, ForwardIndex, Args&&...); + /// Identifier for this feature_selector. + const static std::string id; /** * Scores the (label, term) pair according to this feature selection @@ -41,34 +37,7 @@ class information_gain : public feature_selector * @param lid * @param tid */ - virtual double score(label_id lid, term_id tid) const override - { - double p_tc = term_and_class(tid, lid); - double p_ntnc = not_term_and_not_class(tid, lid); - double p_ntc = not_term_and_class(tid, lid); - double p_tnc = term_and_not_class(tid, lid); - double p_c = prob_class(lid); - double p_t = prob_term(tid); - double p_nc = 1.0 - p_c; - double p_nt = 1.0 - p_t; - - double gain_tc = p_tc * std::log(p_tc / (p_t * p_c)); - double gain_ntnc = p_ntnc * std::log(p_ntnc / (p_nt * p_nc)); - double gain_ntc = p_ntc * std::log(p_ntc / (p_nt * p_c)); - double gain_tnc = p_tnc * std::log(p_tnc / (p_t * p_nc)); - - // if any denominators were zero, make the expression zero - if (std::isnan(gain_tc)) - gain_tc = 0.0; - if (std::isnan(gain_ntnc)) - gain_ntnc = 0.0; - if (std::isnan(gain_ntc)) - gain_ntc = 0.0; - if (std::isnan(gain_tnc)) - gain_tnc = 0.0; - - return gain_tc + gain_ntnc + gain_ntc + gain_tnc; - } + virtual double score(label_id lid, term_id tid) const override; }; } } diff --git a/include/features/make_feature_selector.h b/include/features/make_feature_selector.h deleted file mode 100644 index 149cba915..000000000 --- a/include/features/make_feature_selector.h +++ /dev/null @@ -1,58 +0,0 @@ -/** - * @file make_feature_selector.h - * @author Sean Massung - * - * All files in META are dual-licensed under the MIT and NCSA licenses. For more - * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the - * project. - */ - -#ifndef META_MAKE_FEATURE_SELECTOR_H_ -#define META_MAKE_FEATURE_SELECTOR_H_ - -#include - -#include "cpptoml.h" -#include "features/feature_selector.h" - -namespace meta -{ -namespace features -{ -/** - * Factory method for creating feature selection algorithms. - * @param config_file The path to the configuration file to create selector - * @param fwd_idx The forward_index to perform feature selection on - * @param args any additional arguments to forward to the constructor (usually - * none) - * @return a properly initialized feature_selector - */ -template -std::shared_ptr make_selector(const std::string& config_file, - ForwardIndex fwd_idx, Args&&... 
args)
-{
- auto config = cpptoml::parse_file(config_file);
- auto group = config.get_group("features");
- if (!group)
- throw std::runtime_error{"[features] group missing from config file"};
-
- auto prefix = group->get_as("prefix");
- if (!prefix)
- throw std::runtime_error{"no prefix in [features] group"};
-
- // can't use make_shared since Selector constructors are private
- auto selector = std::shared_ptr{
- new Selector(*prefix, std::move(fwd_idx), std::forward(args)...)};
-
- uint64_t features_per_class = 20;
- auto num_features = group->get_as("features-per-class");
- if (num_features)
- features_per_class = *num_features;
-
- selector->init(features_per_class);
- return selector;
-}
-}
-}
-
-#endif
diff --git a/include/features/selector_factory.h b/include/features/selector_factory.h
new file mode 100644
index 000000000..8a455f6d8
--- /dev/null
+++ b/include/features/selector_factory.h
@@ -0,0 +1,105 @@
+/**
+ * @file selector_factory.h
+ * @author Sean Massung
+ *
+ * All files in META are dual-licensed under the MIT and NCSA licenses. For more
+ * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the
+ * project.
+ */
+
+#ifndef META_FEATURE_SELECTOR_FACTORY_H_
+#define META_FEATURE_SELECTOR_FACTORY_H_
+
+#include "features/feature_selector.h"
+#include "util/factory.h"
+#include "util/shim.h"
+
+namespace cpptoml
+{
+class toml_group;
+}
+
+namespace meta
+{
+namespace features
+{
+
+/**
+ * Factory that is responsible for creating selectors from configuration
+ * files. Clients should use the register_selector method instead of this
+ * class directly to add their own selectors.
+ */
+class selector_factory
+ : public util::factory>
+{
+ friend base_factory;
+
+ private:
+ /**
+ * Constructs the selector_factory singleton.
+ */
+ selector_factory();
+
+ /**
+ * Registers a single-index selector. Used internally.
+ */
+ template
+ void reg();
+};
+
+/**
+ * Convenience method for creating a selector using the factory.
+ *
+ * @param config The configuration group that specifies the configuration
+ * for the selector to be created
+ * @param idx The forward_index to be passed to the selector being
+ * created
+ *
+ * @return a unique_ptr to the selector created from the given
+ * configuration
+ */
+std::unique_ptr
+ make_selector(const cpptoml::toml_group& config,
+ std::shared_ptr idx);
+
+/**
+ * Factory method for creating a selector. This should be specialized if
+ * your given selector requires special construction behavior (e.g.,
+ * reading parameters).
+ */
+template
+std::unique_ptr
+ make_selector(const cpptoml::toml_group& config,
+ std::shared_ptr idx)
+{
+ auto prefix = config.get_as("prefix");
+ if (!prefix)
+ throw selector_factory::exception{"no prefix in [features] group"};
+
+ auto method = config.get_as("method");
+ if (!method)
+ throw selector_factory::exception{
+ "feature selection method required in [features] group"};
+
+ return make_unique(*prefix + "." + *method, std::move(idx));
+}
+
+/**
+ * Registration method for selectors. Clients should use this method to
+ * register any new selectors they write.
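+ * For instance, a hypothetical user-defined my_selector class (exposing
+ * the static id member the factory expects) would be registered once at
+ * startup:
+ * @code
+ * features::register_selector<my_selector>();
+ * @endcode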
+ */ +template +void register_selector() +{ + selector_factory::get().add(Selector::id, + [](const cpptoml::toml_group& config, + std::shared_ptr idx) + { + return make_selector(config, std::move(idx)); + }); +} +} +} +#endif diff --git a/src/features/CMakeLists.txt b/src/features/CMakeLists.txt index a45c01be1..de541b895 100644 --- a/src/features/CMakeLists.txt +++ b/src/features/CMakeLists.txt @@ -2,5 +2,8 @@ project(meta-features) add_subdirectory(tools) -add_library(meta-features feature_selector.cpp) +add_library(meta-features feature_selector.cpp + selector_factory.cpp + chi_square.cpp + information_gain.cpp) target_link_libraries(meta-features meta-index ${CMAKE_THREAD_LIBS_INIT}) diff --git a/src/features/chi_square.cpp b/src/features/chi_square.cpp new file mode 100644 index 000000000..1fd4f9fb7 --- /dev/null +++ b/src/features/chi_square.cpp @@ -0,0 +1,29 @@ +/** + * @file chi_square.cpp + * @author Sean Massung + */ + +#include "features/chi_square.h" + +namespace meta +{ +namespace features +{ +const std::string chi_square::id = "chi-square"; + +double chi_square::score(label_id lid, term_id tid) const +{ + double p_tc = term_and_class(tid, lid); + double p_ntnc = not_term_and_not_class(tid, lid); + double p_ntc = not_term_and_class(tid, lid); + double p_tnc = term_and_not_class(tid, lid); + double p_c = prob_class(lid); + double p_t = prob_term(tid); + + double numerator = p_tc * p_ntnc - p_ntc * p_tnc; + double denominator = p_c * (1.0 - p_c) * p_t * (1.0 - p_t); + + return (numerator * numerator) / denominator; +} +} +} diff --git a/src/features/feature_selector.cpp b/src/features/feature_selector.cpp index d7bde533f..aef3ef4e8 100644 --- a/src/features/feature_selector.cpp +++ b/src/features/feature_selector.cpp @@ -54,7 +54,7 @@ void feature_selector::score_all() prog(tid); for (uint64_t lbl = 0; lbl < idx_->num_labels(); ++lbl) scores[lbl][tid] - = std::make_pair(tid, score(label_id{lbl + 1}, term_id{tid})); + = std::make_pair(tid, score(static_cast(lbl + 1), term_id{tid})); } prog.end(); @@ -151,7 +151,7 @@ void feature_selector::print_summary(uint64_t k /* = 20 */) const for (uint64_t lbl = 0; lbl < idx_->num_labels(); ++lbl) { std::cout << std::endl << "Top " << k << " features for \"" - << idx_->class_label_from_id(label_id{lbl + 1}) + << idx_->class_label_from_id(static_cast(lbl + 1)) << "\":" << std::endl << "===============================" << std::endl; diff --git a/src/features/information_gain.cpp b/src/features/information_gain.cpp new file mode 100644 index 000000000..b101bbfcc --- /dev/null +++ b/src/features/information_gain.cpp @@ -0,0 +1,44 @@ +/** + * @file information_gain.cpp + * @author Sean Massung + */ + +#include "features/information_gain.h" + +namespace meta +{ +namespace features +{ + +const std::string information_gain::id = "info-gain"; + +double information_gain::score(label_id lid, term_id tid) const +{ + double p_tc = term_and_class(tid, lid); + double p_ntnc = not_term_and_not_class(tid, lid); + double p_ntc = not_term_and_class(tid, lid); + double p_tnc = term_and_not_class(tid, lid); + double p_c = prob_class(lid); + double p_t = prob_term(tid); + double p_nc = 1.0 - p_c; + double p_nt = 1.0 - p_t; + + double gain_tc = p_tc * std::log(p_tc / (p_t * p_c)); + double gain_ntnc = p_ntnc * std::log(p_ntnc / (p_nt * p_nc)); + double gain_ntc = p_ntc * std::log(p_ntc / (p_nt * p_c)); + double gain_tnc = p_tnc * std::log(p_tnc / (p_t * p_nc)); + + // if any denominators were zero, make the expression zero + if (std::isnan(gain_tc)) + gain_tc = 
0.0; + if (std::isnan(gain_ntnc)) + gain_ntnc = 0.0; + if (std::isnan(gain_ntc)) + gain_ntc = 0.0; + if (std::isnan(gain_tnc)) + gain_tnc = 0.0; + + return gain_tc + gain_ntnc + gain_ntc + gain_tnc; +} +} +} diff --git a/src/features/selector_factory.cpp b/src/features/selector_factory.cpp new file mode 100644 index 000000000..a4564a080 --- /dev/null +++ b/src/features/selector_factory.cpp @@ -0,0 +1,57 @@ +/** + * @file selector_factory.cpp + * @author Sean Massung + */ + +#include "cpptoml.h" +#include "features/all.h" +#include "features/selector_factory.h" + +namespace meta +{ +namespace features +{ + +template +void selector_factory::reg() +{ + add(Selector::id, make_selector); +} + +selector_factory::selector_factory() +{ + // built-in feature-selection algorithms + reg(); + reg(); +} + +std::unique_ptr + make_selector(const cpptoml::toml_group& config, + std::shared_ptr idx) +{ + auto group = config.get_group("features"); + if (!group) + throw selector_factory::exception{ + "[features] group missing from config file"}; + + auto prefix = group->get_as("prefix"); + if (!prefix) + throw selector_factory::exception{"no prefix in [features] group"}; + + auto method = group->get_as("method"); + if (!method) + throw selector_factory::exception{ + "feature selection method required in [features] group"}; + + uint64_t features_per_class = 20; + auto num_features = group->get_as("features-per-class"); + if (num_features) + features_per_class = *num_features; + + auto selector + = selector_factory::get().create(*method, *group, std::move(idx)); + selector->init(features_per_class); + return selector; +} +} +} diff --git a/src/features/tools/feature_summary.cpp b/src/features/tools/feature_summary.cpp index 19395a7ee..2ce233e5c 100644 --- a/src/features/tools/feature_summary.cpp +++ b/src/features/tools/feature_summary.cpp @@ -11,7 +11,7 @@ #include "cpptoml.h" #include "util/shim.h" #include "features/all.h" -#include "features/make_feature_selector.h" +#include "features/selector_factory.h" #include "index/forward_index.h" using namespace meta; @@ -35,10 +35,7 @@ int main(int argc, char* argv[]) } auto f_idx = index::make_index(argv[1]); - auto selector - = features::make_selector(argv[1], f_idx); - // auto selector = features::make_selector{argv[1], - // f_idx}; + auto selector = features::make_selector(config, f_idx); selector->select(100); selector->print_summary(10); } From b9427495f6f78121da218ab8af223e3bef1ef033 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 29 Nov 2014 15:38:20 -0600 Subject: [PATCH 039/481] update config file for feature selection params --- config.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/config.toml b/config.toml index 64f37de3a..671f42403 100644 --- a/config.toml +++ b/config.toml @@ -48,4 +48,6 @@ dev-sections = [19, 21] test-sections = [22, 24] [features] +method = "info-gain" prefix = "features" +features-per-class = 20 From 0125204677b0fffeac60d4c89b3c17bd9e005348 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 29 Nov 2014 16:07:32 -0600 Subject: [PATCH 040/481] fix bugs in feature_selector --- include/features/feature_selector.h | 1 - src/features/feature_selector.cpp | 34 ++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/include/features/feature_selector.h b/include/features/feature_selector.h index 480b6709a..f7ad78b9b 100644 --- a/include/features/feature_selector.h +++ b/include/features/feature_selector.h @@ -14,7 +14,6 @@ #include #include -#include 
"features/make_feature_selector.h" #include "util/disk_vector.h" #include "index/forward_index.h" diff --git a/src/features/feature_selector.cpp b/src/features/feature_selector.cpp index aef3ef4e8..378d20713 100644 --- a/src/features/feature_selector.cpp +++ b/src/features/feature_selector.cpp @@ -62,7 +62,7 @@ void feature_selector::score_all() { std::sort(v.begin(), v.end(), [&](const auto& a, const auto& b) { - return a.second < b.second; + return a.second > b.second; }); }); @@ -134,7 +134,7 @@ void feature_selector::calc_probs() prog.end(); for (auto& p : class_prob_) - p /= idx_->num_labels(); + p /= idx_->num_docs(); for (auto& p : term_prob_) p /= total_terms; @@ -169,34 +169,52 @@ void feature_selector::print_summary(uint64_t k /* = 20 */) const double feature_selector::prob_term(term_id id) const { - return term_prob_.at(id); + auto p = term_prob_.at(id); + if(p < 0 || p > 1) + throw std::runtime_error{std::string{__func__} + ": " + std::to_string(p)}; + return p; } double feature_selector::prob_class(label_id id) const { - return class_prob_.at(id - 1); + auto p = class_prob_.at(id - 1); + if(p < 0 || p > 1) + throw std::runtime_error{std::string{__func__} + ": " + std::to_string(p)}; + return p; } double feature_selector::term_and_class(term_id term, label_id label) const { - return co_occur_.at(label - 1).at(term); + auto p = co_occur_.at(label - 1).at(term); + if(p < 0 || p > 1) + throw std::runtime_error{std::string{__func__} + ": " + std::to_string(p)}; + return p; } double feature_selector::not_term_and_not_class(term_id term, label_id label) const { - return 1.0 - term_and_class(term, label) - not_term_and_class(term, label) + auto p = 1.0 - term_and_class(term, label) - not_term_and_class(term, label) - term_and_not_class(term, label); + if(p < 0 || p > 1) + throw std::runtime_error{std::string{__func__} + ": " + std::to_string(p)}; + return p; } double feature_selector::term_and_not_class(term_id term, label_id label) const { - return term_prob_.at(term) - term_and_class(term, label); + auto p = term_prob_.at(term) - term_and_class(term, label); + if(p < 0 || p > 1) + throw std::runtime_error{std::string{__func__} + ": " + std::to_string(p)}; + return p; } double feature_selector::not_term_and_class(term_id term, label_id label) const { - return class_prob_.at(label - 1) - term_and_class(term, label); + auto p = class_prob_.at(label - 1) - term_and_class(term, label); + if(p < 0 || p > 1) + throw std::runtime_error{std::string{__func__} + ": " + std::to_string(p)}; + return p; } } } From 2ecd32a69921a889f10518fe1a3442f07c932f1e Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 29 Nov 2014 16:07:42 -0600 Subject: [PATCH 041/481] add correlation_coefficient feature selection --- include/features/all.h | 1 + include/features/correlation_coefficient.h | 45 ++++++++++++++++++++++ src/features/CMakeLists.txt | 1 + src/features/correlation_coefficient.cpp | 33 ++++++++++++++++ src/features/selector_factory.cpp | 1 + 5 files changed, 81 insertions(+) create mode 100644 include/features/correlation_coefficient.h create mode 100644 src/features/correlation_coefficient.cpp diff --git a/include/features/all.h b/include/features/all.h index 0f121b77b..7bcdce320 100644 --- a/include/features/all.h +++ b/include/features/all.h @@ -1,2 +1,3 @@ #include "features/chi_square.h" #include "features/information_gain.h" +#include "features/correlation_coefficient.h" diff --git a/include/features/correlation_coefficient.h b/include/features/correlation_coefficient.h new file mode 100644 
index 000000000..f166598f0 --- /dev/null +++ b/include/features/correlation_coefficient.h @@ -0,0 +1,45 @@ +/** + * @file correlation_coefficient.h + * @author Sean Massung + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_CORRELATION_COEFFICIENT_H_ +#define META_CORRELATION_COEFFICIENT_H_ + +#include "features/feature_selector.h" + +namespace meta +{ +namespace features +{ +/** + * Performs correlation coefficient feature selection: + * \f$ \chi^2(t, c_i) = + * \frac{(P(t,c_i) P(\overline{t}, \overline{c_i}) - P(t, \overline{c_i}) + * P(\overline{t},c_i))^2} + * {P(t) P(\overline{t}) P(c_i) P(\overline{c_i})} \f$ + */ +class correlation_coefficient : public feature_selector +{ + public: + /// Inherit constructor. + using feature_selector::feature_selector; + + /// Identifier for this feature_selector. + const static std::string id; + + /** + * Scores the (label_id, term) pair according to this feature selection + * metric. + * @param lid + * @param tid + */ + virtual double score(label_id lid, term_id tid) const override; +}; +} +} +#endif diff --git a/src/features/CMakeLists.txt b/src/features/CMakeLists.txt index de541b895..e87998125 100644 --- a/src/features/CMakeLists.txt +++ b/src/features/CMakeLists.txt @@ -5,5 +5,6 @@ add_subdirectory(tools) add_library(meta-features feature_selector.cpp selector_factory.cpp chi_square.cpp + correlation_coefficient.cpp information_gain.cpp) target_link_libraries(meta-features meta-index ${CMAKE_THREAD_LIBS_INIT}) diff --git a/src/features/correlation_coefficient.cpp b/src/features/correlation_coefficient.cpp new file mode 100644 index 000000000..269f297e7 --- /dev/null +++ b/src/features/correlation_coefficient.cpp @@ -0,0 +1,33 @@ +/** + * @file correlation_coefficient.cpp + * @author Sean Massung + */ + +#include "features/correlation_coefficient.h" + +namespace meta +{ +namespace features +{ +const std::string correlation_coefficient::id = "corr-coef"; + +double correlation_coefficient::score(label_id lid, term_id tid) const +{ + double p_tc = term_and_class(tid, lid); + double p_ntnc = not_term_and_not_class(tid, lid); + double p_ntc = not_term_and_class(tid, lid); + double p_tnc = term_and_not_class(tid, lid); + double p_c = prob_class(lid); + double p_t = prob_term(tid); + + double numerator = p_tc * p_ntnc - p_ntc * p_tnc; + double denominator = p_c * (1.0 - p_c) * p_t * (1.0 - p_t); + + // avoid divide by zero + if (denominator == 0.0) + return 0.0; + + return numerator / std::sqrt(denominator); +} +} +} diff --git a/src/features/selector_factory.cpp b/src/features/selector_factory.cpp index a4564a080..92d913522 100644 --- a/src/features/selector_factory.cpp +++ b/src/features/selector_factory.cpp @@ -23,6 +23,7 @@ selector_factory::selector_factory() // built-in feature-selection algorithms reg(); reg(); + reg(); } std::unique_ptr From 77de6a62d6f01c2d8efd441eb10199f0393f749f Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 29 Nov 2014 16:09:14 -0600 Subject: [PATCH 042/481] formula for correlation coefficient --- include/features/correlation_coefficient.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/features/correlation_coefficient.h b/include/features/correlation_coefficient.h index f166598f0..6c8bc9bcf 100644 --- a/include/features/correlation_coefficient.h +++ b/include/features/correlation_coefficient.h @@ -18,10 +18,10 @@ namespace features { /** 
 * Performs correlation coefficient feature selection:
- * \f$ \chi^2(t, c_i) =
- * \frac{(P(t,c_i) P(\overline{t}, \overline{c_i}) - P(t, \overline{c_i})
- * P(\overline{t},c_i))^2}
- * {P(t) P(\overline{t}) P(c_i) P(\overline{c_i})} \f$
+ * \f$ CC(t, c_i) =
+ * \frac{P(t,c_i) P(\overline{t}, \overline{c_i}) - P(t, \overline{c_i})
+ * P(\overline{t},c_i)}
+ * {\sqrt{P(t) P(\overline{t}) P(c_i) P(\overline{c_i})}} \f$
  */
 class correlation_coefficient : public feature_selector
 {
   public:

From a54da837d4b723801354e2f202fb13708b8a2e71 Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Fri, 5 Dec 2014 12:25:21 -0600
Subject: [PATCH 043/481] tools file to create a new line corpus dataset based
 on output of topic model

---
 src/topics/tools/CMakeLists.txt   |  3 +
 src/topics/tools/topic_corpus.cpp | 96 +++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+)
 create mode 100644 src/topics/tools/topic_corpus.cpp

diff --git a/src/topics/tools/CMakeLists.txt b/src/topics/tools/CMakeLists.txt
index 3b525916d..1dac3ae67 100644
--- a/src/topics/tools/CMakeLists.txt
+++ b/src/topics/tools/CMakeLists.txt
@@ -3,3 +3,6 @@ target_link_libraries(lda meta-topics)
 
 add_executable(lda-topics lda-topics.cpp)
 target_link_libraries(lda-topics meta-index)
+
+add_executable(topic-corpus topic_corpus.cpp)
+target_link_libraries(topic-corpus meta-topics)
diff --git a/src/topics/tools/topic_corpus.cpp b/src/topics/tools/topic_corpus.cpp
new file mode 100644
index 000000000..483ec649a
--- /dev/null
+++ b/src/topics/tools/topic_corpus.cpp
@@ -0,0 +1,96 @@
+/**
+ * @file topic_corpus.cpp
+ * @author Sean Massung
+ *
+ * All files in META are dual-licensed under the MIT and NCSA licenses. For more
+ * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the
+ * project.
+ */
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include "cpptoml.h"
+#include "util/filesystem.h"
+
+using namespace meta;
+
+int print_usage(const std::string& name)
+{
+    std::cerr
+        << "Usage: " << name
+        << " config_file model.theta\n"
+           "\tCreates a line_corpus dataset based on the topics from an LDA run"
+        << std::endl;
+    return 1;
+}
+
+std::vector<size_t> get_topic_ids(std::ifstream& thetas)
+{
+    std::vector<size_t> topic_ids;
+    std::string line;
+    while (thetas)
+    {
+        std::getline(thetas, line);
+        if (line.empty())
+            continue;
+        std::istringstream stream{line};
+        size_t did;
+        stream >> did;
+
+        std::string to_split;
+        size_t best_topic = 0;
+        double best_prob = 0;
+        while (stream)
+        {
+            stream >> to_split;
+            if (to_split.length() == 0)
+                continue;
+            size_t idx = to_split.find_first_of(':');
+            size_t topic{std::stoul(to_split.substr(0, idx))};
+            double prob = std::stod(to_split.substr(idx + 1));
+            if (prob > best_prob)
+            {
+                best_topic = topic;
+                best_prob = prob;
+            }
+        }
+        topic_ids.push_back(best_topic);
+    }
+    std::cout << "Found " << topic_ids.size() << " documents."
<< std::endl; + return topic_ids; +} + +void create_topic_corpus(const std::string& prefix, const std::string& dataset, + std::ifstream& thetas) +{ + auto topic_ids = get_topic_ids(thetas); + auto new_file = prefix + "/" + dataset + "/" + dataset + + "-topics.dat.labels"; + std::ofstream out_labels{new_file}; + for (auto& topic : topic_ids) + out_labels << "t" << topic << std::endl; + std::cout << "Saved new labels file: " << new_file << std::endl; +} + +int main(int argc, char* argv[]) +{ + if (argc != 3) + return print_usage(argv[0]); + + auto config = cpptoml::parse_file(argv[1]); + + auto ctype = *config.get_as("corpus-type"); + if (ctype != "line-corpus") + { + std::cerr << "Currently only line_corpus format is supported!" + << std::endl; + return 1; + } + + auto prefix = *config.get_as("prefix"); + auto dataset = *config.get_as("dataset"); + std::ifstream thetas{argv[2]}; + create_topic_corpus(prefix, dataset, thetas); +} From b024d08107d55c0d31d52d7a2c478a0757d24784 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 8 Dec 2014 10:59:25 -0600 Subject: [PATCH 044/481] standardize config format for lm-test and diff_analyzer; give example config settings in config.toml --- config.toml | 10 ++++++++++ src/analyzers/diff_analyzer.cpp | 6 +----- src/lm/diff.cpp | 12 ++++++++---- src/lm/language_model.cpp | 7 ++----- src/lm/tools/create-dataset.cpp | 2 +- 5 files changed, 22 insertions(+), 15 deletions(-) diff --git a/config.toml b/config.toml index 671f42403..00c29b92a 100644 --- a/config.toml +++ b/config.toml @@ -47,6 +47,16 @@ train-sections = [0, 18] dev-sections = [19, 21] test-sections = [22, 24] +[diff] +prefix = "../data" +dataset = "20newsgroups" +n-value = 3 +max-edits = 3 + +[language-model] +format = "learn" +n-value = 3 + [features] method = "info-gain" prefix = "features" diff --git a/src/analyzers/diff_analyzer.cpp b/src/analyzers/diff_analyzer.cpp index 109cf7a02..4ae0dbb66 100644 --- a/src/analyzers/diff_analyzer.cpp +++ b/src/analyzers/diff_analyzer.cpp @@ -74,11 +74,7 @@ std::unique_ptr const cpptoml::toml_group& config) { auto filts = analyzer::load_filters(global, config); - auto diff_config = global.get_group("diff-config"); - if (!diff_config) - throw analyzer::analyzer_exception{ - "diff-config section needed for diff analyzer"}; - return make_unique(*diff_config, std::move(filts)); + return make_unique(global, std::move(filts)); } } } diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index 1ce3328e2..1c60e047e 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -16,18 +16,22 @@ namespace lm { diff::diff(const cpptoml::toml_group& config) : lm_{config} { - auto nval = config.get_as("n-value"); + auto group = config.get_group("diff"); + if (!group) + throw diff_exception{"missing [diff] group from config"}; + + auto nval = group->get_as("n-value"); if (!nval) throw diff_exception{"n-value not specified in config"}; n_val_ = *nval; - auto edits = config.get_as("max-edits"); + auto edits = group->get_as("max-edits"); if (!edits) throw diff_exception{"max-edits not specified in config"}; max_edits_ = *edits; - set_stems(config); - set_function_words(config); + set_stems(*group); + set_function_words(*group); } std::vector> diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index 6a20cb677..e3aa3f890 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -265,15 +265,12 @@ double language_model::perplexity(const sentence& tokens) const { sentence ngram; for (size_t i = 1; i < N_; ++i) - // ngram.push_back(tokens[i - 1]); - 
ngram.push_back("<s>");
+        ngram.push_back("<s>");
 
     double perp = 0.0;
-    for (auto& token : tokens)
-    //for (size_t i = N_; i < tokens.size(); ++i)
+    for (auto& token : tokens)
     {
         ngram.push_back(token);
-        //ngram.push_back(tokens[i]);
         perp += std::log(1.0 / prob(ngram));
         ngram.pop_front();
     }
diff --git a/src/lm/tools/create-dataset.cpp b/src/lm/tools/create-dataset.cpp
index 57088c94b..94ee9fd4f 100644
--- a/src/lm/tools/create-dataset.cpp
+++ b/src/lm/tools/create-dataset.cpp
@@ -16,7 +16,7 @@ int main(int argc, char* argv[])
 {
     bool diagnostic = true;
     auto config = cpptoml::parse_file(argv[1]);
-    lm::diff correcter{*config.get_group("diff-config")};
+    lm::diff correcter{*config.get_group("diff")};
     std::string line;
     std::ifstream in{argv[2]};
     std::ofstream out{"edits.dat"};

From 62d85ec0dd88dae5002a002bd0e9f8414c82f3f8 Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Mon, 8 Dec 2014 14:32:01 -0600
Subject: [PATCH 045/481] create blank_filter

---
 include/analyzers/filters/blank_filter.h | 84 ++++++++++++++++++++++++
 include/utf/utf.h                        |  7 ++
 src/analyzers/analyzer.cpp               |  1 +
 src/analyzers/filters/CMakeLists.txt     |  1 +
 src/analyzers/filters/blank_filter.cpp   | 76 +++++++++++++++++++++
 src/analyzers/filters/filter_factory.cpp |  2 +
 src/utf/utf.cpp                          |  5 ++
 7 files changed, 176 insertions(+)
 create mode 100644 include/analyzers/filters/blank_filter.h
 create mode 100644 src/analyzers/filters/blank_filter.cpp

diff --git a/include/analyzers/filters/blank_filter.h b/include/analyzers/filters/blank_filter.h
new file mode 100644
index 000000000..3011f35e2
--- /dev/null
+++ b/include/analyzers/filters/blank_filter.h
@@ -0,0 +1,84 @@
+/**
+ * @file blank_filter.h
+ * @author Sean Massung
+ *
+ * All files in META are released under the MIT license. For more details,
+ * consult the file LICENSE in the root of the project.
+ */
+
+#ifndef META_BLANK_FILTER_H_
+#define META_BLANK_FILTER_H_
+
+#include <memory>
+
+#include "analyzers/filter_factory.h"
+#include "util/clonable.h"
+#include "util/optional.h"
+
+namespace cpptoml
+{
+class toml_group;
+}
+
+namespace meta
+{
+namespace analyzers
+{
+namespace filters
+{
+
+/**
+ * Filter that strips blank characters out of each token and rejects any
+ * token that is left with no visible characters.
+ */
+class blank_filter : public util::clonable<token_stream, blank_filter>
+{
+  public:
+    /**
+     * Constructs a filter which rejects tokens that do not have any visible
+     * characters in them.
+     * @param source Where to read tokens from
+     */
+    blank_filter(std::unique_ptr<token_stream> source);
+
+    /**
+     * Copy constructor.
+     * @param other The blank_filter to copy into this one
+     */
+    blank_filter(const blank_filter& other);
+
+    /**
+     * Sets the content for the beginning of the filter chain.
+     * @param content The string content to set
+     */
+    void set_content(const std::string& content) override;
+
+    /**
+     * @return the next token in the sequence
+     */
+    std::string next() override;
+
+    /**
+     * Determines whether there are more tokens available in the stream.
+     */
+    operator bool() const override;
+
+    /// Identifier for this filter
+    const static std::string id;
+
+  private:
+    /**
+     * Advances internal state to the next valid token.
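+     * The filter buffers one token ahead (in token_) so that
+     * operator bool() can report whether another non-blank token is
+     * actually available.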
+ */ + void next_token(); + + /// The source to read tokens from + std::unique_ptr source_; + + /// The next buffered token + util::optional token_; +}; +} +} +} +#endif diff --git a/include/utf/utf.h b/include/utf/utf.h index 065260ed5..4f51fefc3 100644 --- a/include/utf/utf.h +++ b/include/utf/utf.h @@ -108,6 +108,13 @@ uint64_t length(const std::string& str); * @param codepoint The codepoint in question */ bool isalpha(uint32_t codepoint); + +/** + * @return whether a code point is a blank character + * @param codepoint The codepoint in question + */ +bool isblank(uint32_t codepoint); + } } diff --git a/src/analyzers/analyzer.cpp b/src/analyzers/analyzer.cpp index 76d7bb6c4..b0a31f444 100644 --- a/src/analyzers/analyzer.cpp +++ b/src/analyzers/analyzer.cpp @@ -11,6 +11,7 @@ #include "analyzers/filters/length_filter.h" #include "analyzers/filters/list_filter.h" #include "analyzers/filters/lowercase_filter.h" +#include "analyzers/filters/blank_filter.h" #include "analyzers/filters/porter2_stemmer.h" #include "analyzers/tokenizers/icu_tokenizer.h" #include "corpus/document.h" diff --git a/src/analyzers/filters/CMakeLists.txt b/src/analyzers/filters/CMakeLists.txt index 42fe8fdcb..07e645a35 100644 --- a/src/analyzers/filters/CMakeLists.txt +++ b/src/analyzers/filters/CMakeLists.txt @@ -6,6 +6,7 @@ add_library(meta-filters alpha_filter.cpp filter_factory.cpp icu_filter.cpp length_filter.cpp + blank_filter.cpp list_filter.cpp lowercase_filter porter2_stemmer.cpp diff --git a/src/analyzers/filters/blank_filter.cpp b/src/analyzers/filters/blank_filter.cpp new file mode 100644 index 000000000..ffae4bddc --- /dev/null +++ b/src/analyzers/filters/blank_filter.cpp @@ -0,0 +1,76 @@ +/** + * @file blank_filter.cpp + * @author Sean Massung + */ + +#include "cpptoml.h" +#include "analyzers/filters/blank_filter.h" +#include "utf/utf.h" + +namespace meta +{ +namespace analyzers +{ +namespace filters +{ + +const std::string blank_filter::id = "blank"; + +blank_filter::blank_filter(std::unique_ptr source) + : source_{std::move(source)} +{ + next_token(); +} + +blank_filter::blank_filter(const blank_filter& other) + : source_{other.source_->clone()}, token_{other.token_} +{ + // nothing +} + +void blank_filter::set_content(const std::string& content) +{ + token_ = util::nullopt; + source_->set_content(content); + next_token(); +} + +std::string blank_filter::next() +{ + auto tok = *token_; + next_token(); + return tok; +} + +blank_filter::operator bool() const +{ + return token_ || *source_; +} + +void blank_filter::next_token() +{ + if (!*source_) + { + token_ = util::nullopt; + return; + } + + while (*source_) + { + auto tok = source_->next(); + tok = utf::remove_if(tok, [](uint32_t codepoint) + { + return utf::isblank(codepoint); + }); + auto len = utf::length(tok); + if (len > 0) + { + token_ = tok; + return; + } + } + token_ = util::nullopt; +} +} +} +} diff --git a/src/analyzers/filters/filter_factory.cpp b/src/analyzers/filters/filter_factory.cpp index 0441663bb..cce7e0ac6 100644 --- a/src/analyzers/filters/filter_factory.cpp +++ b/src/analyzers/filters/filter_factory.cpp @@ -13,6 +13,7 @@ #include "analyzers/filters/empty_sentence_filter.h" #include "analyzers/filters/english_normalizer.h" #include "analyzers/filters/icu_filter.h" +#include "analyzers/filters/blank_filter.h" #include "analyzers/filters/length_filter.h" #include "analyzers/filters/list_filter.h" #include "analyzers/filters/lowercase_filter.h" @@ -60,6 +61,7 @@ filter_factory::filter_factory() register_filter(); register_filter(); 
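    // each id registered here (e.g. blank_filter::id, which is "blank") is
    // the name by which a configuration file's filter chain can select the
    // corresponding filter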
register_filter(); + register_filter(); } } } diff --git a/src/utf/utf.cpp b/src/utf/utf.cpp index 5abf8e92d..ecc475264 100644 --- a/src/utf/utf.cpp +++ b/src/utf/utf.cpp @@ -128,6 +128,11 @@ bool isalpha(uint32_t codepoint) return u_isalpha(codepoint); } +bool isblank(uint32_t codepoint) +{ + return u_isblank(codepoint); +} + uint64_t length(const std::string& str) { const char* s = str.c_str(); From 37da72ea5d038d055745531d090c6990b7346968 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 8 Dec 2014 14:32:27 -0600 Subject: [PATCH 046/481] update sentence tokenization for language model --- src/lm/diff.cpp | 6 +++--- src/lm/language_model.cpp | 4 ++-- src/lm/sentence.cpp | 4 ++-- src/lm/tools/lm-test.cpp | 42 ++++++++++++++++++++++++--------------- 4 files changed, 33 insertions(+), 23 deletions(-) diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index 1c60e047e..62409951b 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -106,7 +106,7 @@ void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth) } sentence rem_cpy{sent}; - rem_cpy.remove(best_idx); + rem_cpy.remove(best_idx); // EDIT if (seen_.find(rem_cpy.to_string()) == seen_.end()) { add(candidates, rem_cpy); @@ -122,7 +122,7 @@ void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth) continue; sentence ins_cpy{sent}; - ins_cpy.insert(best_idx, next.first); + ins_cpy.insert(best_idx, next.first); // EDIT if (seen_.find(ins_cpy.to_string()) == seen_.end()) { @@ -131,7 +131,7 @@ void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth) } sentence sub_cpy{sent}; - sub_cpy.substitute(best_idx, next.first); + sub_cpy.substitute(best_idx, next.first); // EDIT if (seen_.find(sub_cpy.to_string()) == seen_.end()) { diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index e3aa3f890..61e59a0e9 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -13,7 +13,7 @@ #include "analyzers/analyzer.h" #include "analyzers/tokenizers/icu_tokenizer.h" #include "analyzers/filters/lowercase_filter.h" -#include "analyzers/filters/alpha_filter.h" +#include "analyzers/filters/blank_filter.h" #include "analyzers/filters/empty_sentence_filter.h" #include "corpus/corpus.h" #include "util/shim.h" @@ -81,7 +81,7 @@ void language_model::learn_model(const cpptoml::toml_group& config) std::unique_ptr stream; stream = make_unique(); stream = make_unique(std::move(stream)); - stream = make_unique(std::move(stream)); + stream = make_unique(std::move(stream)); stream = make_unique(std::move(stream)); while (corpus->has_next()) diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp index 61ad35bc0..659e7088a 100644 --- a/src/lm/sentence.cpp +++ b/src/lm/sentence.cpp @@ -10,7 +10,7 @@ #include "analyzers/analyzer.h" #include "analyzers/tokenizers/icu_tokenizer.h" #include "analyzers/filters/lowercase_filter.h" -#include "analyzers/filters/alpha_filter.h" +#include "analyzers/filters/blank_filter.h" #include "analyzers/filters/empty_sentence_filter.h" namespace meta @@ -23,7 +23,7 @@ sentence::sentence(const std::string& text) std::unique_ptr stream; stream = make_unique(); stream = make_unique(std::move(stream)); - stream = make_unique(std::move(stream)); + stream = make_unique(std::move(stream)); stream = make_unique(std::move(stream)); stream->set_content(text); while (*stream) diff --git a/src/lm/tools/lm-test.cpp b/src/lm/tools/lm-test.cpp index fbde6f3c4..0f36ede77 100644 --- a/src/lm/tools/lm-test.cpp +++ b/src/lm/tools/lm-test.cpp @@ -13,31 +13,41 @@ using namespace meta; int main(int 
argc, char* argv[]) { + if (argc != 3) + { + std::cerr << "Usage: " << argv[0] << " config.toml sentences.txt" + << std::endl; + return 1; + } + lm::diff correcter{cpptoml::parse_file(argv[1])}; + std::ifstream in{argv[2]}; + std::ofstream out{std::string{argv[2]} + ".out"}; + std::ofstream log{std::string{argv[2]} + ".log"}; std::string line; - while (true) + while (in) { - std::cout << "> "; - std::getline(std::cin, line); + std::getline(in, line); if (line.empty()) - break; + continue; lm::sentence sent{line}; auto candidates = correcter.candidates(sent, true); - std::cout << "Found " << candidates.size() << " candidates." - << std::endl; + out << candidates[0].first.to_string() << std::endl; + log << sent.to_string() << std::endl; + log << "====================================" << std::endl; - for (size_t i = 0; i < 5; ++i) + for (size_t i = 0; i < 5 && i < candidates.size(); ++i) { - std::cout << "====================================" << std::endl; - std::cout << (i + 1) << "." << std::endl; - std::cout << " Sentence: " << candidates[i].first.to_string() - << std::endl; - std::cout << " Score: " << candidates[i].second << std::endl; - std::cout << " Edits:" << std::endl; - for(auto& e: candidates[i].first.operations()) - std::cout << " " << e << std::endl; - std::cout << std::endl; + log << (i + 1) << "."; + log << "\tSentence: " << candidates[i].first.to_string() + << std::endl; + log << "\tScore: " << candidates[i].second << std::endl; + log << "\tEdits:" << std::endl; + for (auto& e : candidates[i].first.operations()) + log << "\t\t" << e << std::endl; + log << std::endl; } + log << "====================================" << std::endl; } } From 453617be56f9f0e96466aa9489757da2f65fd686 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 8 Dec 2014 14:35:50 -0600 Subject: [PATCH 047/481] add all.h file for filters --- include/analyzers/filters/all.h | 10 ++++++++++ src/analyzers/analyzer.cpp | 8 +------- src/analyzers/filters/filter_factory.cpp | 12 +----------- 3 files changed, 12 insertions(+), 18 deletions(-) create mode 100644 include/analyzers/filters/all.h diff --git a/include/analyzers/filters/all.h b/include/analyzers/filters/all.h new file mode 100644 index 000000000..9cf7042f4 --- /dev/null +++ b/include/analyzers/filters/all.h @@ -0,0 +1,10 @@ +#include "analyzers/filters/alpha_filter.h" +#include "analyzers/filters/empty_sentence_filter.h" +#include "analyzers/filters/english_normalizer.h" +#include "analyzers/filters/icu_filter.h" +#include "analyzers/filters/blank_filter.h" +#include "analyzers/filters/length_filter.h" +#include "analyzers/filters/list_filter.h" +#include "analyzers/filters/lowercase_filter.h" +#include "analyzers/filters/porter2_stemmer.h" +#include "analyzers/filters/sentence_boundary.h" diff --git a/src/analyzers/analyzer.cpp b/src/analyzers/analyzer.cpp index b0a31f444..e0bce3e46 100644 --- a/src/analyzers/analyzer.cpp +++ b/src/analyzers/analyzer.cpp @@ -6,13 +6,7 @@ #include "analyzers/filter_factory.h" #include "analyzers/multi_analyzer.h" #include "analyzers/token_stream.h" -#include "analyzers/filters/alpha_filter.h" -#include "analyzers/filters/empty_sentence_filter.h" -#include "analyzers/filters/length_filter.h" -#include "analyzers/filters/list_filter.h" -#include "analyzers/filters/lowercase_filter.h" -#include "analyzers/filters/blank_filter.h" -#include "analyzers/filters/porter2_stemmer.h" +#include "analyzers/filters/all.h" #include "analyzers/tokenizers/icu_tokenizer.h" #include "corpus/document.h" #include "cpptoml.h" diff --git 
a/src/analyzers/filters/filter_factory.cpp b/src/analyzers/filters/filter_factory.cpp index cce7e0ac6..d3b85db32 100644 --- a/src/analyzers/filters/filter_factory.cpp +++ b/src/analyzers/filters/filter_factory.cpp @@ -8,17 +8,7 @@ #include "analyzers/tokenizers/character_tokenizer.h" #include "analyzers/tokenizers/whitespace_tokenizer.h" #include "analyzers/tokenizers/icu_tokenizer.h" - -#include "analyzers/filters/alpha_filter.h" -#include "analyzers/filters/empty_sentence_filter.h" -#include "analyzers/filters/english_normalizer.h" -#include "analyzers/filters/icu_filter.h" -#include "analyzers/filters/blank_filter.h" -#include "analyzers/filters/length_filter.h" -#include "analyzers/filters/list_filter.h" -#include "analyzers/filters/lowercase_filter.h" -#include "analyzers/filters/porter2_stemmer.h" -#include "analyzers/filters/sentence_boundary.h" +#include "analyzers/filters/all.h" namespace meta { From 25eb1b26884d3f26ca3fc008d9b5e4932223b461 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 9 Dec 2014 21:37:05 -0600 Subject: [PATCH 048/481] allow penalties to be specified in config file --- include/lm/diff.h | 14 ++++++++-- src/lm/diff.cpp | 60 ++++++++++++++++++++++++---------------- src/lm/tools/lm-test.cpp | 15 ++++++++++ 3 files changed, 62 insertions(+), 27 deletions(-) diff --git a/include/lm/diff.h b/include/lm/diff.h index 6bae270a4..15fbeddb4 100644 --- a/include/lm/diff.h +++ b/include/lm/diff.h @@ -99,15 +99,23 @@ class diff void add(PQ& candidates, const sentence& sent); language_model lm_; + uint64_t n_val_; uint64_t max_edits_; + + double base_penalty_; + double insert_penalty_; + double substitute_penalty_; + double remove_penalty_; + + bool use_lm_; std::unordered_map> stems_; std::vector fwords_; std::unordered_set seen_; - static constexpr uint64_t max_cand_size_ = 20; - bool use_lm_; - /// balance between perplexity and edit weights + static constexpr uint64_t max_cand_size_ = 20; + /// balance between perplexity and edit weights; doesn't necessarily matter + /// since penalty weights will scale with different values of lambda static constexpr double lambda_ = 0.5; }; diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index 62409951b..06529d714 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -30,6 +30,18 @@ diff::diff(const cpptoml::toml_group& config) : lm_{config} throw diff_exception{"max-edits not specified in config"}; max_edits_ = *edits; + auto b_pen = group->get_as("base-penalty"); + base_penalty_ = b_pen ? *b_pen : 0.0; + + auto i_pen = group->get_as("insert-penalty"); + insert_penalty_ = i_pen ? *i_pen : 0.0; + + auto s_pen = group->get_as("substitute-penalty"); + substitute_penalty_ = s_pen ? *s_pen : 0.0; + + auto r_pen = group->get_as("remove-penalty"); + remove_penalty_ = r_pen ? 
*r_pen : 0.0; + set_stems(*group); set_function_words(*group); } @@ -71,19 +83,6 @@ void diff::add(PQ& candidates, const sentence& sent) candidates.pop(); } -template -void diff::remove(const sentence& sent, size_t idx, PQ& candidates, - uint64_t depth) -{ - sentence rem_cpy{sent}; - rem_cpy.remove(idx); - if (seen_.find(rem_cpy.to_string()) == seen_.end()) - { - add(candidates, rem_cpy); - step(rem_cpy, candidates, depth + 1); - } -} - template void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth) { @@ -105,14 +104,11 @@ void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth) } } - sentence rem_cpy{sent}; - rem_cpy.remove(best_idx); // EDIT - if (seen_.find(rem_cpy.to_string()) == seen_.end()) - { - add(candidates, rem_cpy); - step(rem_cpy, candidates, depth + 1); - } + insert(sent, best_idx, candidates, depth); + remove(sent, best_idx, candidates, depth); + substitute(sent, best_idx, candidates, depth); + /* best.pop_back(); try { @@ -122,7 +118,8 @@ void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth) continue; sentence ins_cpy{sent}; - ins_cpy.insert(best_idx, next.first); // EDIT + ins_cpy.insert(best_idx, next.first, + base_penalty_ + insert_penalty_); if (seen_.find(ins_cpy.to_string()) == seen_.end()) { @@ -131,7 +128,8 @@ void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth) } sentence sub_cpy{sent}; - sub_cpy.substitute(best_idx, next.first); // EDIT + sub_cpy.substitute(best_idx, next.first, + base_penalty_ + substitute_penalty_); if (seen_.find(sub_cpy.to_string()) == seen_.end()) { @@ -144,6 +142,7 @@ void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth) { // ignore if there are no transitions found } + */ } template @@ -153,7 +152,7 @@ void diff::insert(const sentence& sent, size_t idx, PQ& candidates, for (auto& fw : fwords_) { sentence ins_cpy{sent}; - ins_cpy.insert(idx, fw); + ins_cpy.insert(idx, fw, base_penalty_ + substitute_penalty_); if (seen_.find(ins_cpy.to_string()) == seen_.end()) { add(candidates, ins_cpy); @@ -174,7 +173,7 @@ void diff::substitute(const sentence& sent, size_t idx, PQ& candidates, for (auto& stem : it->second) { sentence subbed{sent}; - subbed.substitute(idx, stem); + subbed.substitute(idx, stem, base_penalty_ + substitute_penalty_); if (seen_.find(subbed.to_string()) == seen_.end()) { add(candidates, subbed); @@ -184,6 +183,19 @@ void diff::substitute(const sentence& sent, size_t idx, PQ& candidates, } } +template +void diff::remove(const sentence& sent, size_t idx, PQ& candidates, + uint64_t depth) +{ + sentence rem_cpy{sent}; + rem_cpy.remove(idx, base_penalty_ + remove_penalty_); + if (seen_.find(rem_cpy.to_string()) == seen_.end()) + { + add(candidates, rem_cpy); + step(rem_cpy, candidates, depth + 1); + } +} + template void diff::step(const sentence& sent, PQ& candidates, size_t depth) { diff --git a/src/lm/tools/lm-test.cpp b/src/lm/tools/lm-test.cpp index 0f36ede77..0f7513f0b 100644 --- a/src/lm/tools/lm-test.cpp +++ b/src/lm/tools/lm-test.cpp @@ -8,6 +8,9 @@ #include "meta.h" #include "lm/diff.h" #include "lm/sentence.h" +#include "logging/logger.h" +#include "util/progress.h" +#include "util/filesystem.h" using namespace meta; @@ -20,22 +23,31 @@ int main(int argc, char* argv[]) return 1; } + logging::set_cerr_logging(); + lm::diff correcter{cpptoml::parse_file(argv[1])}; std::ifstream in{argv[2]}; + auto num_sentences = filesystem::num_lines(argv[2]); + printing::progress prog{"Editing sentences ", num_sentences}; std::ofstream 
out{std::string{argv[2]} + ".out"}; std::ofstream log{std::string{argv[2]} + ".log"}; std::string line; + size_t done = 0; + double do_nothing = 0; while (in) { std::getline(in, line); if (line.empty()) continue; + prog(done++); lm::sentence sent{line}; auto candidates = correcter.candidates(sent, true); out << candidates[0].first.to_string() << std::endl; log << sent.to_string() << std::endl; log << "====================================" << std::endl; + if (candidates[0].first.operations().empty()) + ++do_nothing; for (size_t i = 0; i < 5 && i < candidates.size(); ++i) { @@ -50,4 +62,7 @@ int main(int argc, char* argv[]) } log << "====================================" << std::endl; } + prog.end(); + + std::cout << "Percent no-ops: " << do_nothing / done << std::endl; } From ff71b9d5cf4f6a636f565a9f192add57f9b09179 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Wed, 10 Dec 2014 11:57:00 -0600 Subject: [PATCH 049/481] update params in config file for [diff] --- config.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/config.toml b/config.toml index 00c29b92a..f5252f045 100644 --- a/config.toml +++ b/config.toml @@ -52,6 +52,11 @@ prefix = "../data" dataset = "20newsgroups" n-value = 3 max-edits = 3 +# penalty defaults are all zero (no penalty) +base-penalty = 0.0 # base penalty is for any edit +insert-penalty = 0.0 +substitute-penalty = 0.0 +remove-penalty = 0.0 [language-model] format = "learn" From 3b5ac4cc44dfcaa89d34972d5ad689153eff3f52 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Thu, 18 Dec 2014 17:04:37 -0600 Subject: [PATCH 050/481] minor tweaks to diff --- src/analyzers/diff_analyzer.cpp | 7 ++++++- src/lm/diff.cpp | 9 +++++---- src/lm/language_model.cpp | 5 ++--- src/lm/sentence.cpp | 7 +++---- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/analyzers/diff_analyzer.cpp b/src/analyzers/diff_analyzer.cpp index 4ae0dbb66..76de3cb3b 100644 --- a/src/analyzers/diff_analyzer.cpp +++ b/src/analyzers/diff_analyzer.cpp @@ -40,9 +40,14 @@ void diff_analyzer::tokenize(corpus::document& doc) while (*stream_) { auto next = stream_->next(); - buffer += next + " "; if (next == "") + { + // sentence constructor adds and itself sentences.emplace_back(std::move(buffer)); + continue; + } + else if (next != "") + buffer += next + " "; } for(auto& s: sentences) diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index 06529d714..4e36a7cea 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -108,7 +108,6 @@ void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth) remove(sent, best_idx, candidates, depth); substitute(sent, best_idx, candidates, depth); - /* best.pop_back(); try { @@ -142,7 +141,6 @@ void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth) { // ignore if there are no transitions found } - */ } template @@ -152,7 +150,7 @@ void diff::insert(const sentence& sent, size_t idx, PQ& candidates, for (auto& fw : fwords_) { sentence ins_cpy{sent}; - ins_cpy.insert(idx, fw, base_penalty_ + substitute_penalty_); + ins_cpy.insert(idx, fw, base_penalty_ + insert_penalty_); if (seen_.find(ins_cpy.to_string()) == seen_.end()) { add(candidates, ins_cpy); @@ -168,10 +166,13 @@ void diff::substitute(const sentence& sent, size_t idx, PQ& candidates, std::string stemmed{sent[idx]}; Porter2Stemmer::stem(stemmed); auto it = stems_.find(stemmed); - if (it != stems_.end() && it->second.size() != 1) + if (it != stems_.end()) { for (auto& stem : it->second) { + // don't replace with same word! 
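+                // (each remaining surface form sharing this stem becomes a
+                // substitution candidate and is scored by the language model)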
+ if (sent[idx] == stem) + continue; sentence subbed{sent}; subbed.substitute(idx, stem, base_penalty_ + substitute_penalty_); if (seen_.find(subbed.to_string()) == seen_.end()) diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index 61e59a0e9..37ef504cc 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -12,9 +12,7 @@ #include #include "analyzers/analyzer.h" #include "analyzers/tokenizers/icu_tokenizer.h" -#include "analyzers/filters/lowercase_filter.h" -#include "analyzers/filters/blank_filter.h" -#include "analyzers/filters/empty_sentence_filter.h" +#include "analyzers/filters/all.h" #include "corpus/corpus.h" #include "util/shim.h" #include "lm/language_model.h" @@ -81,6 +79,7 @@ void language_model::learn_model(const cpptoml::toml_group& config) std::unique_ptr stream; stream = make_unique(); stream = make_unique(std::move(stream)); + stream = make_unique(std::move(stream)); stream = make_unique(std::move(stream)); stream = make_unique(std::move(stream)); diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp index 659e7088a..21285667e 100644 --- a/src/lm/sentence.cpp +++ b/src/lm/sentence.cpp @@ -9,9 +9,7 @@ #include "lm/sentence.h" #include "analyzers/analyzer.h" #include "analyzers/tokenizers/icu_tokenizer.h" -#include "analyzers/filters/lowercase_filter.h" -#include "analyzers/filters/blank_filter.h" -#include "analyzers/filters/empty_sentence_filter.h" +#include "analyzers/filters/all.h" namespace meta { @@ -23,6 +21,7 @@ sentence::sentence(const std::string& text) std::unique_ptr stream; stream = make_unique(); stream = make_unique(std::move(stream)); + stream = make_unique(std::move(stream)); stream = make_unique(std::move(stream)); stream = make_unique(std::move(stream)); stream->set_content(text); @@ -32,7 +31,7 @@ sentence::sentence(const std::string& text) if (tokens_.empty()) throw sentence_exception{"empty token stream"}; - // remove sentence markers (they're inserted by the LM) + // remove sentence markers tokens_.pop_front(); tokens_.pop_back(); From 932d3c2eb01a6cb6cca6935df387affb248730cb Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 20 Dec 2014 17:05:43 -0600 Subject: [PATCH 051/481] topic-corpus also calculates statistics on original class label distribution --- src/topics/tools/topic_corpus.cpp | 43 +++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/src/topics/tools/topic_corpus.cpp b/src/topics/tools/topic_corpus.cpp index 483ec649a..319cb948f 100644 --- a/src/topics/tools/topic_corpus.cpp +++ b/src/topics/tools/topic_corpus.cpp @@ -69,9 +69,52 @@ void create_topic_corpus(const std::string& prefix, const std::string& dataset, auto new_file = prefix + "/" + dataset + "/" + dataset + "-topics.dat.labels"; std::ofstream out_labels{new_file}; + size_t num_topics = 0; for (auto& topic : topic_ids) + { + if (topic > num_topics) + num_topics = topic; out_labels << "t" << topic << std::endl; + } + ++num_topics; // total is one more than highest id std::cout << "Saved new labels file: " << new_file << std::endl; + + // for each topic, what is the distribution of original class labels? 
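+    // approach: re-read the corpus' original .dat.labels file, count how
+    // often each (original label, assigned topic) pair occurs, and
+    // normalize each label's counts into a distribution over topics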
+ + auto labels_file = prefix + "/" + dataset + "/" + dataset + ".dat.labels"; + std::ifstream orig_labels_in{labels_file}; + std::vector orig_labels; + std::string buf; + while (orig_labels_in >> buf) + orig_labels.push_back(buf); + + std::cout << orig_labels.size() << std::endl; + + std::unordered_map> counts; + uint64_t idx = 0; + for (auto& topic : topic_ids) + { + auto label = orig_labels[idx]; + if (counts[label].empty()) + counts[label].resize(num_topics, 0.0); + ++counts[label][topic]; + ++idx; + } + + std::ofstream out_dist{dataset + ".topic-dist"}; + for (auto& label : counts) + { + out_dist << label.first; + double total = 0.0; + for (auto& count : label.second) + total += count; + for (auto& count : label.second) + out_dist << "\t" << count / total; + out_dist << std::endl; + } + + std::cout << "Saved topic dist file: " << (dataset + ".topic-dist") + << std::endl; } int main(int argc, char* argv[]) From 29cc64b1cd81488b4e0e58d6f6abafb97a78dc15 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 7 Apr 2015 23:54:08 -0500 Subject: [PATCH 052/481] remove unnecessary file (default config file is used for travis) --- travis-config.toml | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 travis-config.toml diff --git a/travis-config.toml b/travis-config.toml deleted file mode 100644 index 78a51a021..000000000 --- a/travis-config.toml +++ /dev/null @@ -1,5 +0,0 @@ -stop-words = "../data/lemur-stopwords.txt" -libsvm-modules = "../deps/libsvm-modules/" -prefix = "../data/" -function-words = "../data/function-words.txt" -query-judgements = "../data/ceeaus/qrels.txt" From fac004b1933ccb301dd278e895532cf4190fe38a Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 8 Apr 2015 19:08:43 -0500 Subject: [PATCH 053/481] Fix bug with reading/writing (very) large integers with compressed_file_*. --- src/io/compressed_file_reader.cpp | 4 ++-- src/io/compressed_file_writer.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/io/compressed_file_reader.cpp b/src/io/compressed_file_reader.cpp index 75bce19c9..404ebabd3 100644 --- a/src/io/compressed_file_reader.cpp +++ b/src/io/compressed_file_reader.cpp @@ -122,10 +122,10 @@ void compressed_file_reader::get_next() for (int64_t bit = numberBits - 1; status_ == 0 && bit >= 0; --bit) { if (read_bit()) - current_value_ |= (1 << bit); + current_value_ |= (1ul << bit); } - current_value_ |= (1 << numberBits); + current_value_ |= (1ul << numberBits); } bool compressed_file_reader::read_bit() diff --git a/src/io/compressed_file_writer.cpp b/src/io/compressed_file_writer.cpp index 35dcc8bbd..158537508 100644 --- a/src/io/compressed_file_writer.cpp +++ b/src/io/compressed_file_writer.cpp @@ -80,7 +80,7 @@ void compressed_file_writer::write(uint64_t value) write_bit(true); for (int64_t bit = length - 1; bit >= 0; --bit) - write_bit(cvalue & 1 << bit); + write_bit(cvalue & (1ul << bit)); } void compressed_file_writer::write_bit(bool bit) From 6651af7c18f3165ae13e07c434d28c19a7fe71d8 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 8 Apr 2015 23:03:59 -0500 Subject: [PATCH 054/481] Allow ycm_extra_conf.py to find compile flags for not-yet-configured files. 
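
Previously, GetCompilationInfoForFile() returned None whenever a file was
missing from the compilation database (for example, a newly created source
file before cmake has been re-run). We now fall back to the flags recorded
for a translation unit that should already be configured
(src/tools/profile.cpp), so semantic completion keeps working for such
files.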
--- contrib/YouCompleteMe/ycm_extra_conf.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/contrib/YouCompleteMe/ycm_extra_conf.py b/contrib/YouCompleteMe/ycm_extra_conf.py index 662e24e09..15cfa15d5 100644 --- a/contrib/YouCompleteMe/ycm_extra_conf.py +++ b/contrib/YouCompleteMe/ycm_extra_conf.py @@ -74,11 +74,15 @@ def GetCompilationInfoForFile(filename): compilation_info.compiler_flags_.append('c++') return compilation_info - if not os.path.exists(filename): - return None - - return database.GetCompilationInfoForFile(filename) + compilation_info = database.GetCompilationInfoForFile(filename) + # if we can't find this file in our database, fall back to the flags for + # profile.cpp + if not compilation_info.compiler_flags_: + cpp_name = os.path.join(DirectoryOfThisScript(), 'src', 'tools', + 'profile.cpp') + compilation_info = database.GetCompilationInfoForFile(cpp_name) + return compilation_info def FlagsForFile( filename ): compilation_info = GetCompilationInfoForFile(filename) From 8c77fa48dda09d7f6ca2a0ece959ee585f7433ea Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 8 Apr 2015 23:10:32 -0500 Subject: [PATCH 055/481] Change forward_index to use a postings like inverted_index. This is mostly just for consistency, and it helps us consolidate a little bit of code (and makes forward_index a little less messy). --- include/index/disk_index_impl.h | 17 -- include/index/postings_data.h | 15 +- include/index/postings_data.tcc | 23 ++- include/index/postings_file.h | 68 +++++++ include/index/postings_file_writer.h | 55 ++++++ include/io/compressed_file_reader.h | 5 + include/io/compressed_file_writer.h | 6 + src/index/disk_index.cpp | 10 - src/index/forward_index.cpp | 269 ++++++++++++--------------- src/index/inverted_index.cpp | 76 +++----- src/io/compressed_file_reader.cpp | 17 ++ src/io/compressed_file_writer.cpp | 46 ++++- 12 files changed, 354 insertions(+), 253 deletions(-) create mode 100644 include/index/postings_file.h create mode 100644 include/index/postings_file_writer.h diff --git a/include/index/disk_index_impl.h b/include/index/disk_index_impl.h index 7d79f17ab..39fed93a5 100644 --- a/include/index/disk_index_impl.h +++ b/include/index/disk_index_impl.h @@ -96,11 +96,6 @@ class disk_index::disk_index_impl */ void load_label_id_mapping(); - /** - * Loads the postings file. - */ - void load_postings(); - /** * Saves the label_id mapping. */ @@ -135,11 +130,6 @@ class disk_index::disk_index_impl */ void set_unique_terms(doc_id id, uint64_t terms); - /** - * @return the mmap file for the postings. - */ - const io::mmap_file& postings() const; - /** * @return the total number of unique terms in the index. */ @@ -199,13 +189,6 @@ class disk_index::disk_index_impl /// Assigns an integer to each class label (used for liblinear mappings) util::invertible_map label_ids_; - /** - * A pointer to a memory-mapped postings file. It is a pointer because - * we want to delay the initialization of it until the postings file is - * created in some cases. 
- */ - util::optional postings_; - /// mutex for thread-safe operations mutable std::mutex mutex_; }; diff --git a/include/index/postings_data.h b/include/index/postings_data.h index 444899589..ebfc8ee08 100644 --- a/include/index/postings_data.h +++ b/include/index/postings_data.h @@ -162,12 +162,12 @@ class postings_data return out; out.write(pd.p_id_); - uint32_t size = pd.counts_.size(); + uint64_t size = pd.counts_.size(); out.write(size); for (auto& p : pd.counts_) { out.write(p.first); - out.write(p.second); + out.write(static_cast(p.second)); } return out; @@ -191,17 +191,6 @@ class postings_data */ void read_compressed(io::compressed_file_reader& reader); - /** - * @param out The output stream to write to - */ - void write_libsvm(std::ofstream& out) const - { - out << p_id_; - for (auto& c : counts_) - out << ' ' << (c.first + 1) << ':' << c.second; - out << '\n'; - } - /** * @return the term_id for this postings_data */ diff --git a/include/index/postings_data.tcc b/include/index/postings_data.tcc index 45faf36dc..49927aca4 100644 --- a/include/index/postings_data.tcc +++ b/include/index/postings_data.tcc @@ -109,6 +109,9 @@ void postings_data ::write_compressed(io::compressed_file_writer & writer) const { + // TODO: The special casing here for term_id as PrimaryKey only works + // under debug mode, so doubles will *always* get truncated under + // release mode... count_t mutable_counts{counts_.contents()}; writer.write(mutable_counts[0].first); if (std::is_same::value @@ -118,10 +121,7 @@ void postings_data } else { - uint64_t to_write; - std::memcpy(&to_write, &mutable_counts[0].second, - sizeof(mutable_counts[0].second)); - writer.write(to_write); + writer.write(mutable_counts[0].second); } // use gap encoding on the SecondaryKeys (we know they are integral types) @@ -140,10 +140,7 @@ void postings_data } else { - uint64_t to_write; - std::memcpy(&to_write, &mutable_counts[i].second, - sizeof(mutable_counts[i].second)); - writer.write(to_write); + writer.write(mutable_counts[i].second); } } @@ -169,12 +166,18 @@ void postings_data::read_compressed( // we're using gap encoding last_id += this_id; SecondaryKey key{last_id}; - uint64_t next = reader.next(); + double count; + // TODO: see write_compressed; a similar problem here if (std::is_same::value) + { + uint64_t next = reader.next(); count = static_cast(next); + } else - std::memcpy(&count, &next, sizeof(next)); + { + count = reader.next_double(); + } counts_.emplace_back(key, count); } diff --git a/include/index/postings_file.h b/include/index/postings_file.h new file mode 100644 index 000000000..156243810 --- /dev/null +++ b/include/index/postings_file.h @@ -0,0 +1,68 @@ +/** + * @file postings_file.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_INDEX_POSTINGS_FILE_H_ +#define META_INDEX_POSTINGS_FILE_H_ + +#include "index/postings_data.h" +#include "io/mmap_file.h" +#include "util/disk_vector.h" + +namespace meta +{ +namespace index +{ + +template +class postings_file +{ + public: + using postings_data_type = postings_data; + + /** + * Opens a postings file. + * @param filename The path to the file + */ + postings_file(const std::string& filename) + : postings_{filename}, bit_locations_{filename + ".index"} + { + // nothing + } + + /** + * Obtains a postings data object for the given primary key. 
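+     * A key past the end of the index yields an empty postings_data
+     * object rather than an error.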
+ * @param pk The primary key to look up + * @return a shared pointer to the postings data extracted from the + * file + */ + std::shared_ptr find(PrimaryKey pk) const + { + auto pdata = std::make_shared(pk); + uint64_t idx{pk}; + + // if we are in-bounds of the postings file, populate counts + if (idx < bit_locations_.size()) + { + io::compressed_file_reader reader{ + postings_, io::default_compression_reader_func}; + reader.seek(bit_locations_.at(idx)); + + pdata->read_compressed(reader); + } + + return pdata; + } + + private: + io::mmap_file postings_; + util::disk_vector bit_locations_; +}; +} +} +#endif diff --git a/include/index/postings_file_writer.h b/include/index/postings_file_writer.h new file mode 100644 index 000000000..40438d576 --- /dev/null +++ b/include/index/postings_file_writer.h @@ -0,0 +1,55 @@ +/** + * @file postings_file_writer.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_INDEX_POSTINGS_FILE_WRITER_H_ +#define META_INDEX_POSTINGS_FILE_WRITER_H_ + +#include "io/compressed_file_writer.h" +#include "util/disk_vector.h" + +namespace meta +{ +namespace index +{ + +class postings_file_writer +{ + public: + /** + * Opens a postings file for writing. + * @param filename The filename (prefix) for the postings file. + */ + postings_file_writer(const std::string& filename, uint64_t unique_keys) + : output_{filename, io::default_compression_writer_func}, + bit_locations_{filename + ".index", unique_keys}, + id_{0} + { + // nothing + } + + /** + * Writes a postings data object to the file. + * @param pdata The postings_data to be written + */ + template + void write(const PostingsData& pdata) + { + bit_locations_[id_] = output_.bit_location(); + pdata.write_compressed(output_); + ++id_; + } + + private: + io::compressed_file_writer output_; + util::disk_vector bit_locations_; + uint64_t id_; +}; +} +} +#endif diff --git a/include/io/compressed_file_reader.h b/include/io/compressed_file_reader.h index 5ed2962da..b2215021b 100644 --- a/include/io/compressed_file_reader.h +++ b/include/io/compressed_file_reader.h @@ -96,6 +96,11 @@ class compressed_file_reader */ std::string next_string(); + /** + * @return the next double from this compressed file + */ + double next_double(); + /** * @return the current bit location in this file */ diff --git a/include/io/compressed_file_writer.h b/include/io/compressed_file_writer.h index 0b9fc8a64..500356a72 100644 --- a/include/io/compressed_file_writer.h +++ b/include/io/compressed_file_writer.h @@ -59,6 +59,12 @@ class compressed_file_writer */ void write(const std::string& str); + /** + * Writes a double to the file. + * @param value The double to write + */ + void write(double value); + /** * Closes this compressed file. 
*/ diff --git a/src/index/disk_index.cpp b/src/index/disk_index.cpp index 53333a554..faa046d11 100644 --- a/src/index/disk_index.cpp +++ b/src/index/disk_index.cpp @@ -172,11 +172,6 @@ void disk_index::disk_index_impl::load_label_id_mapping() map::load_mapping(label_ids_, index_name_ + files[LABEL_IDS_MAPPING]); } -void disk_index::disk_index_impl::load_postings() -{ - postings_ = io::mmap_file{index_name_ + files[POSTINGS]}; -} - void disk_index::disk_index_impl::save_label_id_mapping() { map::save_mapping(label_ids_, index_name_ + files[LABEL_IDS_MAPPING]); @@ -203,11 +198,6 @@ void disk_index::disk_index_impl::set_unique_terms(doc_id id, uint64_t terms) (*unique_terms_)[id] = terms; } -const io::mmap_file& disk_index::disk_index_impl::postings() const -{ - return *postings_; -} - uint64_t disk_index::disk_index_impl::total_unique_terms() const { return term_id_mapping_->size(); diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index eef93a9fe..0485de956 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -8,7 +8,8 @@ #include "index/disk_index_impl.h" #include "index/forward_index.h" #include "index/inverted_index.h" -#include "index/postings_data.h" +#include "index/postings_file.h" +#include "index/postings_file_writer.h" #include "index/string_list.h" #include "index/string_list_writer.h" #include "index/vocabulary_map.h" @@ -35,23 +36,6 @@ class forward_index::impl */ impl(forward_index* idx); - /** - * This function loads a disk index from its filesystem - * representation. - */ - void load_index(); - - /** - * This function initializes the forward index. - * @param config_file The configuration file used to create the index - */ - void create_index(const std::string& config_file); - - /** - * Initializes this index's metadata structures. - */ - void init_metadata(); - /** * @param config the configuration settings for this index */ @@ -90,11 +74,25 @@ class forward_index::impl */ void compressed_postings_to_libsvm(uint64_t num_docs); + /** + * Compresses the postings file created by uninverting. + * @param filename The file to compress + * @param num_docs The number of documents in that file + */ + void compress(const std::string& filename, uint64_t num_docs); + + /** + * Loads the postings file. 
+ * @param filename The path to the postings file to load + */ + void load_postings(); + /// the total number of unique terms if term_id_mapping_ is unused uint64_t total_unique_terms_; - /// doc_id -> postings file byte location - util::optional> doc_byte_locations_; + /// the postings file + util::optional> postings_; private: /// Pointer to the forward_index this is an implementation of @@ -144,32 +142,28 @@ std::string forward_index::liblinear_data(doc_id d_id) const if (d_id >= num_docs()) throw forward_index_exception{"invalid doc_id in search_primary"}; - uint64_t begin = (*fwd_impl_->doc_byte_locations_)[d_id]; - uint64_t length = 0; - while (impl_->postings()[begin + length] != '\n') - { - ++length; - if (begin + length >= impl_->postings().size()) - throw forward_index_exception{"out of bounds!"}; - } + auto pdata = search_primary(d_id); + std::stringstream out; - return std::string{impl_->postings().begin() + begin, length}; + out << lbl_id(d_id); + for (const auto& count : pdata->counts()) + out << ' ' << (count.first + 1) << ':' << count.second; + return out.str(); } void forward_index::load_index() { LOG(info) << "Loading index from disk: " << index_name() << ENDLG; - fwd_impl_->init_metadata(); - + impl_->initialize_metadata(); impl_->load_doc_id_mapping(); - impl_->load_postings(); auto config = cpptoml::parse_file(index_name() + "/config.toml"); if (!fwd_impl_->is_libsvm_format(config)) impl_->load_term_id_mapping(); impl_->load_label_id_mapping(); + fwd_impl_->load_postings(); std::ifstream unique_terms_file{index_name() + "/corpus.uniqueterms"}; unique_terms_file >> fwd_impl_->total_unique_terms_; @@ -188,7 +182,6 @@ void forward_index::create_index(const std::string& config_file) << ENDLG; fwd_impl_->create_libsvm_postings(config); - fwd_impl_->create_libsvm_metadata(); impl_->save_label_id_mapping(); } else @@ -201,20 +194,22 @@ void forward_index::create_index(const std::string& config_file) auto inv_idx = make_index(config_file); fwd_impl_->create_uninverted_metadata(inv_idx->index_name()); - impl_->load_label_id_mapping(); fwd_impl_->uninvert(*inv_idx); - fwd_impl_->init_metadata(); - impl_->load_postings(); - fwd_impl_->set_doc_byte_locations(); impl_->load_term_id_mapping(); fwd_impl_->total_unique_terms_ = impl_->total_unique_terms(); } - // now that the files are tokenized, we can create the string_list + impl_->load_label_id_mapping(); + fwd_impl_->load_postings(); impl_->load_doc_id_mapping(); + impl_->initialize_metadata(); - std::ofstream unique_terms_file{index_name() + "/corpus.uniqueterms"}; - unique_terms_file << fwd_impl_->total_unique_terms_; + { + std::ofstream unique_terms_file{index_name() + "/corpus.uniqueterms"}; + unique_terms_file << fwd_impl_->total_unique_terms_; + } + + assert(filesystem::file_exists(index_name() + "/corpus.uniqueterms")); LOG(info) << "Done creating index: " << index_name() << ENDLG; } @@ -230,93 +225,67 @@ void forward_index::impl::create_libsvm_postings(const cpptoml::table& config) throw forward_index_exception{ "dataset missing from configuration file"}; - std::string existing_file = *prefix + "/" + *dataset + "/" + *dataset - + ".dat"; - - filesystem::copy_file(existing_file, - idx_->index_name() + idx_->impl_->files[POSTINGS]); - - init_metadata(); + auto libsvm_data = *prefix + "/" + *dataset + "/" + *dataset + ".dat"; + auto filename = idx_->index_name() + idx_->impl_->files[POSTINGS]; - // now, assign byte locations for libsvm doc starting points - idx_->impl_->load_postings(); - set_doc_byte_locations(); -} + 
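+
+    // single pass over the libsvm input: write one compressed postings
+    // entry per document, recording its label, length, and unique term
+    // count along the way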
uint64_t num_docs = filesystem::num_lines(libsvm_data); + idx_->impl_->initialize_metadata(num_docs); -void forward_index::impl::set_doc_byte_locations() -{ - doc_id d_id{0}; - uint8_t last_byte = '\n'; - printing::progress progress{" > Setting document locations: ", - idx_->impl_->postings().size()}; - for (uint64_t idx = 0; idx < idx_->impl_->postings().size(); ++idx) + total_unique_terms_ = 0; { - progress(idx); - if (last_byte == '\n') + postings_file_writer out{filename, num_docs}; + + printing::progress progress{" > Creating postings from libsvm data: ", + num_docs}; + doc_id d_id{0}; + std::ifstream input{libsvm_data}; + std::string line; + auto docid_writer = idx_->impl_->make_doc_id_writer(num_docs); + while (std::getline(input, line)) { - (*doc_byte_locations_)[d_id] = idx; - ++d_id; - } - last_byte = idx_->impl_->postings()[idx]; - } -} + progress(d_id); -void forward_index::impl::init_metadata() -{ - uint64_t num_docs = filesystem::num_lines(idx_->index_name() - + idx_->impl_->files[POSTINGS]); - idx_->impl_->initialize_metadata(num_docs); - doc_byte_locations_ = util::disk_vector( - idx_->index_name() + "/lexicon.index", num_docs); -} + auto lbl = io::libsvm_parser::label(line); + idx_->impl_->set_label(d_id, lbl); -void forward_index::impl::create_libsvm_metadata() -{ - total_unique_terms_ = 0; + uint64_t num_unique = 0; + double length = 0; + forward_index::postings_data_type pdata{d_id}; - printing::progress progress{" > Creating metadata: ", - doc_byte_locations_->size()}; + auto counts = io::libsvm_parser::counts(line); + for (const auto& count : counts) + { + ++num_unique; + if (count.first > total_unique_terms_) + total_unique_terms_ = count.first; + length += count.second; + } - doc_id d_id{0}; - std::ifstream in{idx_->index_name() + idx_->impl_->files[POSTINGS]}; - std::string line; - auto docid_writer = idx_->impl_->make_doc_id_writer(idx_->num_docs()); - while (in.good()) - { - std::getline(in, line); - if (line.empty()) - break; + pdata.set_counts(counts); + out.write(pdata); - progress(d_id); + docid_writer.insert(d_id, "[no path]"); + idx_->impl_->set_length(d_id, static_cast(length)); + idx_->impl_->set_unique_terms(d_id, num_unique); - class_label lbl = io::libsvm_parser::label(line); - idx_->impl_->set_label(d_id, lbl); - - uint64_t num_unique = 0; - uint64_t length = 0; - for (const auto& count_pair : io::libsvm_parser::counts(line)) - { - ++num_unique; - if (count_pair.first > total_unique_terms_) - total_unique_terms_ = count_pair.first; - length += static_cast(count_pair.second); + ++d_id; } - docid_writer.insert(d_id, "[no path]"); - idx_->impl_->set_length(d_id, length); - idx_->impl_->set_unique_terms(d_id, num_unique); - - ++d_id; + // +1 since we subtracted one from each of the ids in the + // libsvm_parser::counts() function + ++total_unique_terms_; } - ++total_unique_terms_; // since we subtracted one from the ids earlier + LOG(info) << "Created compressed postings file (" + << printing::bytes_to_units(filesystem::file_size(filename)) + << ")" << ENDLG; } void forward_index::impl::create_uninverted_metadata(const std::string& name) { - auto files = {DOC_IDS_MAPPING, DOC_IDS_MAPPING_INDEX, DOC_SIZES, - DOC_LABELS, DOC_UNIQUETERMS, LABEL_IDS_MAPPING, - TERM_IDS_MAPPING, TERM_IDS_MAPPING_INVERSE}; + auto files = {DOC_IDS_MAPPING, DOC_IDS_MAPPING_INDEX, DOC_SIZES, DOC_LABELS, + DOC_UNIQUETERMS, LABEL_IDS_MAPPING, TERM_IDS_MAPPING, + TERM_IDS_MAPPING_INVERSE}; for (const auto& file : files) filesystem::copy_file(name + idx_->impl_->files[file], @@ 
-341,19 +310,16 @@ uint64_t forward_index::unique_terms() const return fwd_impl_->total_unique_terms_; } -auto forward_index::search_primary( - doc_id d_id) const -> std::shared_ptr +auto forward_index::search_primary(doc_id d_id) const + -> std::shared_ptr { - auto pdata = std::make_shared(d_id); - auto line = liblinear_data(d_id); - pdata->set_counts(io::libsvm_parser::counts(line)); - return pdata; + return fwd_impl_->postings_->find(d_id); } void forward_index::impl::uninvert(const inverted_index& inv_idx) { io::compressed_file_reader inv_reader{inv_idx.index_name() - + idx_->impl_->files[POSTINGS], + + idx_->impl_->files[POSTINGS], io::default_compression_reader_func}; term_id t_id{0}; @@ -370,54 +336,49 @@ void forward_index::impl::uninvert(const inverted_index& inv_idx) } handler.merge_chunks(); - compressed_postings_to_libsvm(inv_idx.num_docs()); + compress(idx_->index_name() + idx_->impl_->files[POSTINGS], + inv_idx.num_docs()); } -void forward_index::impl::compressed_postings_to_libsvm(uint64_t num_docs) +void forward_index::impl::compress(const std::string& filename, + uint64_t num_docs) { - idx_->impl_->load_labels(); + auto ucfilename = filename + ".uncompressed"; + filesystem::rename_file(filename, ucfilename); - auto filename = idx_->index_name() + idx_->impl_->files[POSTINGS]; - filesystem::rename_file(filename, filename + ".tmp"); - std::ofstream output{filename}; - io::compressed_file_reader input{filename + ".tmp", - io::default_compression_reader_func}; - - // handler for writing gaps of blank documents - doc_id last_id{0}; - auto write_gap = [&](doc_id next_id) + // create a scope to ensure the reader and writer close properly so we + // can calculate the size of the compressed file and delete the + // uncompressed version at the end { - while (next_id > last_id + 1) + postings_file_writer out{filename, num_docs}; + + forward_index::postings_data_type pdata; + auto length = filesystem::file_size(ucfilename) * 8; // number of bits + io::compressed_file_reader in{ucfilename, + io::default_compression_reader_func}; + + printing::progress progress{ + " > Compressing postings: ", length, 500, 8 * 1024 /* 1KB */ + }; + // note: we will be accessing pdata in sorted order + while (in.has_next()) { - ++last_id; - index_pdata_type empty; - empty.set_primary_key( - static_cast(idx_->impl_->doc_label_id(last_id))); - empty.write_libsvm(output); + in >> pdata; + progress(in.bit_location()); + out.write(pdata); } - }; - - // read from input, write to output, changing doc_id to class_label for the - // correct libsvm format - index_pdata_type pdata; - while (input >> pdata) - { - doc_id d_id = pdata.primary_key(); - - // write empty document lines for any documents in a gap - write_gap(d_id); - - // write current document - pdata.set_primary_key( - static_cast(idx_->impl_->doc_label_id(d_id))); - pdata.write_libsvm(output); - last_id = d_id; } - // write any trailing empty documents - write_gap(doc_id{num_docs}); + LOG(info) << "Created compressed postings file (" + << printing::bytes_to_units(filesystem::file_size(filename)) + << ")" << ENDLG; - filesystem::delete_file(filename + ".tmp"); + filesystem::delete_file(ucfilename); +} + +void forward_index::impl::load_postings() +{ + postings_ = {idx_->index_name() + idx_->impl_->files[POSTINGS]}; } } } diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp index 55b0e8355..b5053e09f 100644 --- a/src/index/inverted_index.cpp +++ b/src/index/inverted_index.cpp @@ -8,6 +8,8 @@ #include "index/chunk_handler.h" #include 
"index/disk_index_impl.h" #include "index/inverted_index.h" +#include "index/postings_file.h" +#include "index/postings_file_writer.h" #include "index/string_list.h" #include "index/string_list_writer.h" #include "index/vocabulary_map.h" @@ -63,14 +65,16 @@ class inverted_index::impl */ void compress(const std::string& filename, uint64_t num_unique_terms); + /** + * Loads the postings file. + */ + void load_postings(); + /// The analyzer used to tokenize documents. std::unique_ptr analyzer_; - /** - * PrimaryKey -> postings location. - * Each index corresponds to a PrimaryKey (uint64_t). - */ - util::optional> term_bit_locations_; + util::optional> postings_; /// the total number of term occurrences in the entire corpus uint64_t total_corpus_terms_; @@ -141,7 +145,7 @@ void inverted_index::create_index(const std::string& config_file) impl_->load_term_id_mapping(); impl_->save_label_id_mapping(); - impl_->load_postings(); + inv_impl_->load_postings(); LOG(info) << "Done creating index: " << index_name() << ENDLG; } @@ -155,12 +159,8 @@ void inverted_index::load_index() impl_->initialize_metadata(); impl_->load_doc_id_mapping(); impl_->load_term_id_mapping(); - - inv_impl_->term_bit_locations_ - = util::disk_vector(index_name() + "/lexicon.index"); - impl_->load_label_id_mapping(); - impl_->load_postings(); + inv_impl_->load_postings(); } void inverted_index::impl::tokenize_docs(corpus::corpus* docs, @@ -221,49 +221,46 @@ void inverted_index::impl::tokenize_docs(corpus::corpus* docs, void inverted_index::impl::compress(const std::string& filename, uint64_t num_unique_terms) { - std::string cfilename{filename + ".compressed"}; + std::string ucfilename{filename + ".uncompressed"}; + filesystem::rename_file(filename, ucfilename); - // create scope so the writer closes and we can calculate the size of the - // file as well as rename it + // create a scope to ensure the reader and writer close properly so we + // can calculate the size of the compressed file and delete the + // uncompressed version at the end { - io::compressed_file_writer out{cfilename, - io::default_compression_writer_func}; + postings_file_writer out{filename, num_unique_terms}; vocabulary_map_writer vocab{idx_->index_name() + idx_->impl_->files[TERM_IDS_MAPPING]}; postings_data pdata; - auto length = filesystem::file_size(filename) * 8; // number of bits - io::compressed_file_reader in{filename, + auto length = filesystem::file_size(ucfilename) * 8; // number of bits + io::compressed_file_reader in{ucfilename, io::default_compression_reader_func}; - // allocate memory for the term_id -> term location mapping now - // that we know how many terms there are - term_bit_locations_ = util::disk_vector( - idx_->index_name() + "/lexicon.index", num_unique_terms); - printing::progress progress{ " > Compressing postings: ", length, 500, 8 * 1024 /* 1KB */ }; // note: we will be accessing pdata in sorted order - term_id t_id{0}; while (in.has_next()) { in >> pdata; progress(in.bit_location()); vocab.insert(pdata.primary_key()); - (*term_bit_locations_)[t_id] = out.bit_location(); - pdata.write_compressed(out); - ++t_id; + out.write(pdata); } } LOG(info) << "Created compressed postings file (" - << printing::bytes_to_units(filesystem::file_size(cfilename)) + << printing::bytes_to_units(filesystem::file_size(filename)) << ")" << ENDLG; - filesystem::delete_file(filename); - filesystem::rename_file(cfilename, filename); + filesystem::delete_file(ucfilename); +} + +void inverted_index::impl::load_postings() +{ + postings_ = 
{idx_->index_name() + idx_->impl_->files[POSTINGS]}; } uint64_t inverted_index::term_freq(term_id t_id, doc_id d_id) const @@ -309,23 +306,10 @@ uint64_t inverted_index::doc_freq(term_id t_id) const return search_primary(t_id)->counts().size(); } -auto inverted_index::search_primary( - term_id t_id) const -> std::shared_ptr +auto inverted_index::search_primary(term_id t_id) const + -> std::shared_ptr { - uint64_t idx{t_id}; - - // if the term doesn't exist in the index, return an empty postings_data - if (idx >= inv_impl_->term_bit_locations_->size()) - return std::make_shared(t_id); - - io::compressed_file_reader reader{impl_->postings(), - io::default_compression_reader_func}; - reader.seek(inv_impl_->term_bit_locations_->at(idx)); - - auto pdata = std::make_shared(t_id); - pdata->read_compressed(reader); - - return pdata; + return inv_impl_->postings_->find(t_id); } } } diff --git a/src/io/compressed_file_reader.cpp b/src/io/compressed_file_reader.cpp index 404ebabd3..0869ee2d2 100644 --- a/src/io/compressed_file_reader.cpp +++ b/src/io/compressed_file_reader.cpp @@ -3,6 +3,7 @@ * @author Sean Massung */ +#include #include #include "io/compressed_file_reader.h" #include "io/mmap_file.h" @@ -74,6 +75,22 @@ std::string compressed_file_reader::next_string() return str; } +double compressed_file_reader::next_double() +{ + auto byte = mapping_(current_value_); + auto msign = read_bit() ? -1 : 1; + auto mantissa = static_cast(byte) * msign; + + get_next(); + byte = mapping_(current_value_); + + auto esign = read_bit() ? -1 : 1; + auto exponent = static_cast(byte) * esign; + get_next(); + + return mantissa * std::pow(2.0, exponent); +} + void compressed_file_reader::seek(uint64_t bit_offset) { uint64_t byte = bit_offset / 8; diff --git a/src/io/compressed_file_writer.cpp b/src/io/compressed_file_writer.cpp index 158537508..7b1e8ff05 100644 --- a/src/io/compressed_file_writer.cpp +++ b/src/io/compressed_file_writer.cpp @@ -13,9 +13,8 @@ namespace meta namespace io { -compressed_file_writer::compressed_file_writer(const std::string& filename, - std::function - mapping) +compressed_file_writer::compressed_file_writer( + const std::string& filename, std::function mapping) : outfile_{fopen(filename.c_str(), "w")}, char_cursor_{0}, bit_cursor_{0}, @@ -45,6 +44,47 @@ void compressed_file_writer::write(const std::string& str) } } +void compressed_file_writer::write(double value) +{ + // http://stackoverflow.com/questions/5672960/how-can-i-extract-the-mantissa-of-a-double + // http://dlib.net/dlib/float_details.h.html + // + // Essentially, we write the double as two integers: the mantissa and + // the exponent, such that mantissa * std::pow(2, exponent) = value. 
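+ // As an illustrative worked example: for value = 0.75, std::frexp
+ // returns 0.75 with exp = 0, so the initial mantissa is
+ // 0.75 * 2^53 = 6755399441055744 with exponent = -53; the shrinking
+ // loop below reduces this to mantissa = 24 and exponent = -5, and
+ // indeed 24 * std::pow(2.0, -5) == 0.75.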
+ int exp; + auto digits = std::numeric_limits::digits; + auto mantissa + = static_cast(std::frexp(value, &exp) * (1ul << digits)); + int16_t exponent = exp - digits; + + // see dlib link above; tries to shrink mantissa for more efficient + // serialization + for (int i = 0; i < 8 && (mantissa & 0xFF) == 0; ++i) + { + mantissa >>= 8; + exponent += 8; + } + + // write in a weird backwards order to make reading in easier later + bool sign = false; + if (mantissa < 0) + { + sign = true; + mantissa *= -1; + } + write(static_cast(mantissa)); + write_bit(sign); + + sign = false; + if (exponent < 0) + { + sign = true; + exponent *= -1; + } + write(static_cast(exponent)); + write_bit(sign); +} + uint64_t compressed_file_writer::bit_location() const { return bit_location_; From de6737da9fb147d27af180b03df629b7b7a5f9dd Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 8 Apr 2015 23:11:39 -0500 Subject: [PATCH 056/481] Fix a bug where forward index would always be detected as incomplete. This occurs when the index is created from libsvm data and there is thus no TERM_ID_MAPPING file. --- src/index/forward_index.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index 0485de956..4a79a91cd 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -119,13 +119,17 @@ bool forward_index::valid() const { if (!filesystem::file_exists(index_name() + "/corpus.uniqueterms")) { - LOG(info) - << "Existing forward index detected as invalid; recreating" - << ENDLG; + LOG(info) << "Existing forward index detected as invalid; recreating" + << ENDLG; return false; } for (auto& f : impl_->files) { + // this is not required if generated directly from libsvm data + if (f == impl_->files[TERM_IDS_MAPPING] + || f == impl_->files[TERM_IDS_MAPPING_INVERSE]) + continue; + if (!filesystem::file_exists(index_name() + "/" + std::string{f})) { LOG(info) From 29315a1caafe827bb7a4f17d3f9ba07bf0a3ffde Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 8 Apr 2015 23:41:01 -0500 Subject: [PATCH 057/481] Consistent naming for index files. --- include/index/postings_file.h | 2 +- include/index/postings_file_writer.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/index/postings_file.h b/include/index/postings_file.h index 156243810..9f27adfab 100644 --- a/include/index/postings_file.h +++ b/include/index/postings_file.h @@ -30,7 +30,7 @@ class postings_file * @param filename The path to the file */ postings_file(const std::string& filename) - : postings_{filename}, bit_locations_{filename + ".index"} + : postings_{filename}, bit_locations_{filename + "_index"} { // nothing } diff --git a/include/index/postings_file_writer.h b/include/index/postings_file_writer.h index 40438d576..883ccba27 100644 --- a/include/index/postings_file_writer.h +++ b/include/index/postings_file_writer.h @@ -27,7 +27,7 @@ class postings_file_writer */ postings_file_writer(const std::string& filename, uint64_t unique_keys) : output_{filename, io::default_compression_writer_func}, - bit_locations_{filename + ".index", unique_keys}, + bit_locations_{filename + "_index", unique_keys}, id_{0} { // nothing From 63ae78f97bbb377ce843a9547c22d6423277e73b Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 8 Apr 2015 23:41:32 -0500 Subject: [PATCH 058/481] Fix issue where doubles were truncated in release mode in indexing. 
This only matters for forward_index since this is currently the only place where we might actually have doubles for feature count values. --- include/index/postings_data.h | 2 ++ include/index/postings_data.tcc | 14 +++++--------- include/index/postings_file.h | 3 ++- include/index/postings_file_writer.h | 4 ++-- src/index/forward_index.cpp | 8 ++++---- 5 files changed, 15 insertions(+), 16 deletions(-) diff --git a/include/index/postings_data.h b/include/index/postings_data.h index ebfc8ee08..afa094cbd 100644 --- a/include/index/postings_data.h +++ b/include/index/postings_data.h @@ -180,6 +180,7 @@ class postings_data * file. * @param writer The compressed file to write to */ + template void write_compressed(io::compressed_file_writer& writer) const; /** @@ -189,6 +190,7 @@ class postings_data * file. * @param reader The compressed file to read from */ + template void read_compressed(io::compressed_file_reader& reader); /** diff --git a/include/index/postings_data.tcc b/include/index/postings_data.tcc index 49927aca4..75e629df1 100644 --- a/include/index/postings_data.tcc +++ b/include/index/postings_data.tcc @@ -105,17 +105,14 @@ PrimaryKey postings_data::primary_key() const } template +template void postings_data ::write_compressed(io::compressed_file_writer & writer) const { - // TODO: The special casing here for term_id as PrimaryKey only works - // under debug mode, so doubles will *always* get truncated under - // release mode... count_t mutable_counts{counts_.contents()}; writer.write(mutable_counts[0].first); - if (std::is_same::value - || std::is_same::value) + if (std::is_same::value) { writer.write(static_cast(mutable_counts[0].second)); } @@ -133,8 +130,7 @@ void postings_data cur_id = temp_id; writer.write(mutable_counts[i].first); - if (std::is_same::value - || std::is_same::value) + if (std::is_same::value) { writer.write(static_cast(mutable_counts[i].second)); } @@ -149,6 +145,7 @@ void postings_data } template +template void postings_data::read_compressed( io::compressed_file_reader& reader) { @@ -168,8 +165,7 @@ void postings_data::read_compressed( SecondaryKey key{last_id}; double count; - // TODO: see write_compressed; a similar problem here - if (std::is_same::value) + if (std::is_same::value) { uint64_t next = reader.next(); count = static_cast(next); diff --git a/include/index/postings_file.h b/include/index/postings_file.h index 9f27adfab..f2d1bb42c 100644 --- a/include/index/postings_file.h +++ b/include/index/postings_file.h @@ -41,6 +41,7 @@ class postings_file * @return a shared pointer to the postings data extracted from the * file */ + template std::shared_ptr find(PrimaryKey pk) const { auto pdata = std::make_shared(pk); @@ -53,7 +54,7 @@ class postings_file postings_, io::default_compression_reader_func}; reader.seek(bit_locations_.at(idx)); - pdata->read_compressed(reader); + pdata->template read_compressed(reader); } return pdata; diff --git a/include/index/postings_file_writer.h b/include/index/postings_file_writer.h index 883ccba27..1af16d5eb 100644 --- a/include/index/postings_file_writer.h +++ b/include/index/postings_file_writer.h @@ -37,11 +37,11 @@ class postings_file_writer * Writes a postings data object to the file. 
* @param pdata The postings_data to be written */ - template + template void write(const PostingsData& pdata) { bit_locations_[id_] = output_.bit_location(); - pdata.write_compressed(output_); + pdata.template write_compressed(output_); ++id_; } diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index 4a79a91cd..9f4ae547a 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -266,7 +266,7 @@ void forward_index::impl::create_libsvm_postings(const cpptoml::table& config) } pdata.set_counts(counts); - out.write(pdata); + out.write(pdata); docid_writer.insert(d_id, "[no path]"); idx_->impl_->set_length(d_id, static_cast(length)); @@ -317,7 +317,7 @@ uint64_t forward_index::unique_terms() const auto forward_index::search_primary(doc_id d_id) const -> std::shared_ptr { - return fwd_impl_->postings_->find(d_id); + return fwd_impl_->postings_->find(d_id); } void forward_index::impl::uninvert(const inverted_index& inv_idx) @@ -333,7 +333,7 @@ void forward_index::impl::uninvert(const inverted_index& inv_idx) while (inv_reader.has_next()) { inverted_pdata_type pdata{t_id}; - pdata.read_compressed(inv_reader); + pdata.read_compressed(inv_reader); producer(pdata.primary_key(), pdata.counts()); ++t_id; } @@ -369,7 +369,7 @@ void forward_index::impl::compress(const std::string& filename, { in >> pdata; progress(in.bit_location()); - out.write(pdata); + out.write(pdata); } } From 62b055a94407128e34db7cea70339ac1e268bbf5 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 8 Apr 2015 23:48:42 -0500 Subject: [PATCH 059/481] Remove libsvm_analyzer. This isn't used anywhere, even if we are indexing libsvm formatted data (the forward_index class manually invokes io::libsvm_parser functions). --- include/analyzers/all.h | 2 -- include/analyzers/libsvm_analyzer.h | 41 ----------------------------- src/analyzers/CMakeLists.txt | 1 - src/analyzers/analyzer_factory.cpp | 1 - src/analyzers/libsvm_analyzer.cpp | 29 -------------------- 5 files changed, 74 deletions(-) delete mode 100644 include/analyzers/libsvm_analyzer.h delete mode 100644 src/analyzers/libsvm_analyzer.cpp diff --git a/include/analyzers/all.h b/include/analyzers/all.h index fe53d1de4..99f09f541 100644 --- a/include/analyzers/all.h +++ b/include/analyzers/all.h @@ -1,7 +1,5 @@ #include "analyzers/analyzer.h" #include "analyzers/multi_analyzer.h" -#include "analyzers/libsvm_analyzer.h" - #include "analyzers/ngram/ngram_analyzer.h" #include "analyzers/ngram/ngram_word_analyzer.h" diff --git a/include/analyzers/libsvm_analyzer.h b/include/analyzers/libsvm_analyzer.h deleted file mode 100644 index 34bea38e8..000000000 --- a/include/analyzers/libsvm_analyzer.h +++ /dev/null @@ -1,41 +0,0 @@ -/** - * @file libsvm_analyzer.h - * @author Sean Massung - * - * All files in META are dual-licensed under the MIT and NCSA licenses. For more - * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the - * project. - */ - -#ifndef META_LIBSVM_ANALYZER_ -#define META_LIBSVM_ANALYZER_ - -#include "analyzers/analyzer.h" -#include "util/clonable.h" - -namespace meta -{ -namespace analyzers -{ - -/** - * libsvm_analyzer tokenizes documents that have been created from a - * line_corpus, where each line is in libsvm input format and stored in the - * document's content field. - */ -class libsvm_analyzer : public util::clonable -{ - public: - /** - * Tokenizes a file into a document. 
- * @param doc The document to store the tokenized information in - */ - virtual void tokenize(corpus::document& doc) override; - - /// Identifier for this analyzer. - const static std::string id; -}; -} -} - -#endif diff --git a/src/analyzers/CMakeLists.txt b/src/analyzers/CMakeLists.txt index 8d5def411..7cf7e05af 100644 --- a/src/analyzers/CMakeLists.txt +++ b/src/analyzers/CMakeLists.txt @@ -6,7 +6,6 @@ add_subdirectory(tools) add_library(meta-analyzers analyzer.cpp analyzer_factory.cpp - libsvm_analyzer.cpp multi_analyzer.cpp ngram/ngram_analyzer.cpp ngram/ngram_word_analyzer.cpp) diff --git a/src/analyzers/analyzer_factory.cpp b/src/analyzers/analyzer_factory.cpp index 257eeeb86..3c9fd28b0 100644 --- a/src/analyzers/analyzer_factory.cpp +++ b/src/analyzers/analyzer_factory.cpp @@ -21,7 +21,6 @@ analyzer_factory::analyzer_factory() { // built-in analyzers register_analyzer(); - register_analyzer(); } } } diff --git a/src/analyzers/libsvm_analyzer.cpp b/src/analyzers/libsvm_analyzer.cpp deleted file mode 100644 index 707507e55..000000000 --- a/src/analyzers/libsvm_analyzer.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/** - * @file libsvm_analyzer.cpp - * @author Sean Massung - */ - -#include - -#include "corpus/document.h" -#include "io/libsvm_parser.h" -#include "analyzers/libsvm_analyzer.h" - -namespace meta -{ -namespace analyzers -{ - -const std::string libsvm_analyzer::id = "libsvm"; - -void libsvm_analyzer::tokenize(corpus::document& doc) -{ - for (auto& count_pair : io::libsvm_parser::counts(doc.content(), false)) - doc.increment(std::to_string(count_pair.first), count_pair.second); - - // label info is inside the document content for libsvm format; the line - // corpus will not set it since it's not in a separate file - doc.label(io::libsvm_parser::label(doc.content())); -} -} -} From e7d23a4a68b09461256b5af35c0df7685d678663 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Thu, 9 Apr 2015 12:37:49 -0500 Subject: [PATCH 060/481] Remove unused function declarations in index impls. --- src/index/forward_index.cpp | 16 ---------------- src/index/inverted_index.cpp | 9 --------- 2 files changed, 25 deletions(-) diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index 9f4ae547a..2cb9f9e5b 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -41,11 +41,6 @@ class forward_index::impl */ void create_libsvm_postings(const cpptoml::table& config); - /** - * Initializes structures based on a libsvm-formatted file. - */ - void create_libsvm_metadata(); - /** * @param inv_idx The inverted index to uninvert */ @@ -63,17 +58,6 @@ class forward_index::impl */ bool is_libsvm_format(const cpptoml::table& config) const; - /** - * Calculates which documents start at which bytes in the postings file. - */ - void set_doc_byte_locations(); - - /** - * Converts postings.index into a libsvm formatted file - * @param num_docs The total number of documents - */ - void compressed_postings_to_libsvm(uint64_t num_docs); - /** * Compresses the postings file created by uninverting. 
* @param filename The file to compress diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp index b5053e09f..a474acf1c 100644 --- a/src/index/inverted_index.cpp +++ b/src/index/inverted_index.cpp @@ -51,15 +51,6 @@ class inverted_index::impl void tokenize_docs(corpus::corpus* docs, chunk_handler& handler); - /** - * Creates the lexicon file (or "dictionary") which has pointers into - * the large postings file - * @param postings_file - * @param lexicon_file - */ - void create_lexicon(const std::string& postings_file, - const std::string& lexicon_file); - /** * Compresses the large postings file. */ From 4187aed113948b97488a9bedf1be26e438b1ea23 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Thu, 9 Apr 2015 13:01:27 -0500 Subject: [PATCH 061/481] Add tool to convert a meta forward index to libsvm format. This dumps the whole index: users will probably want to use head/tail to select just the training/test portions they want. --- src/index/tools/CMakeLists.txt | 3 +++ src/index/tools/forward-to-libsvm.cpp | 34 +++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 src/index/tools/forward-to-libsvm.cpp diff --git a/src/index/tools/CMakeLists.txt b/src/index/tools/CMakeLists.txt index 5565bb576..ebbdda31b 100644 --- a/src/index/tools/CMakeLists.txt +++ b/src/index/tools/CMakeLists.txt @@ -23,3 +23,6 @@ target_link_libraries(print-vocab meta-index) add_executable(search-vocab search-vocab.cpp) target_link_libraries(search-vocab meta-index) + +add_executable(forward-to-libsvm forward-to-libsvm.cpp) +target_link_libraries(forward-to-libsvm meta-index) diff --git a/src/index/tools/forward-to-libsvm.cpp b/src/index/tools/forward-to-libsvm.cpp new file mode 100644 index 000000000..38551afe2 --- /dev/null +++ b/src/index/tools/forward-to-libsvm.cpp @@ -0,0 +1,34 @@ +/** + * @file forward-to-libsvm.cpp + * @author Chase Geigle + */ + +#include "index/forward_index.h" + +using namespace meta; + +int main(int argc, char** argv) +{ + if (argc != 3) + { + std::cerr << "Usage:\t" << argv[0] << " config.toml output-file" + << std::endl; + return 1; + } + + logging::set_cerr_logging(); + + auto idx = index::make_index(argv[1]); + { + std::ofstream output{argv[2]}; + printing::progress progress{" > Converting to libsvm: ", + idx->num_docs()}; + for (const auto& did : idx->docs()) + { + progress(did); + output << idx->liblinear_data(did) << "\n"; + } + } + + return 0; +} From 5763fb9b1b6fa3d668833d74af444aa5704d4e43 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Thu, 9 Apr 2015 14:12:28 -0500 Subject: [PATCH 062/481] Switch expected to also be a double for sgd loss functions. (This is in preparation for supporting regression in addition to classification.) 
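As a sketch of the intended use (illustrative values; least_squares is
one of the loss functions touched by this patch):

    #include "classify/loss/least_squares.h"

    meta::classify::loss::least_squares ls;
    double prediction = 1.75;
    double expected = 2.5; // a real-valued target, not just a +/-1 label
    double cost = ls.loss(prediction, expected);       // 0.5 * 0.75^2 = 0.28125
    double grad = ls.derivative(prediction, expected); // -0.75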
--- include/classify/loss/hinge.h | 4 ++-- include/classify/loss/huber.h | 4 ++-- include/classify/loss/least_squares.h | 4 ++-- include/classify/loss/logistic.h | 4 ++-- include/classify/loss/loss_function.h | 4 ++-- include/classify/loss/modified_huber.h | 4 ++-- include/classify/loss/perceptron.h | 4 ++-- include/classify/loss/smooth_hinge.h | 4 ++-- include/classify/loss/squared_hinge.h | 4 ++-- src/classify/loss/hinge.cpp | 4 ++-- src/classify/loss/huber.cpp | 4 ++-- src/classify/loss/least_squares.cpp | 4 ++-- src/classify/loss/logistic.cpp | 4 ++-- src/classify/loss/modified_huber.cpp | 4 ++-- src/classify/loss/perceptron.cpp | 4 ++-- src/classify/loss/smooth_hinge.cpp | 4 ++-- src/classify/loss/squared_hinge.cpp | 4 ++-- 17 files changed, 34 insertions(+), 34 deletions(-) diff --git a/include/classify/loss/hinge.h b/include/classify/loss/hinge.h index ceb8865a4..35fc4cfdc 100644 --- a/include/classify/loss/hinge.h +++ b/include/classify/loss/hinge.h @@ -35,8 +35,8 @@ struct hinge : public loss_function */ const static std::string id; - double loss(double prediction, int expected) const override; - double derivative(double prediction, int expected) const override; + double loss(double prediction, double expected) const override; + double derivative(double prediction, double expected) const override; }; } } diff --git a/include/classify/loss/huber.h b/include/classify/loss/huber.h index f2a7c1d06..ae1ccbdf8 100644 --- a/include/classify/loss/huber.h +++ b/include/classify/loss/huber.h @@ -32,8 +32,8 @@ struct huber : public loss_function */ const static std::string id; - double loss(double prediction, int expected) const override; - double derivative(double prediction, int expected) const override; + double loss(double prediction, double expected) const override; + double derivative(double prediction, double expected) const override; }; } } diff --git a/include/classify/loss/least_squares.h b/include/classify/loss/least_squares.h index 37eca2446..159573997 100644 --- a/include/classify/loss/least_squares.h +++ b/include/classify/loss/least_squares.h @@ -30,8 +30,8 @@ struct least_squares : public loss_function */ const static std::string id; - double loss(double prediction, int expected) const override; - double derivative(double prediction, int expected) const override; + double loss(double prediction, double expected) const override; + double derivative(double prediction, double expected) const override; }; } } diff --git a/include/classify/loss/logistic.h b/include/classify/loss/logistic.h index 3629cdbe2..416dceb63 100644 --- a/include/classify/loss/logistic.h +++ b/include/classify/loss/logistic.h @@ -31,8 +31,8 @@ struct logistic : public loss_function */ const static std::string id; - double loss(double prediction, int expected) const override; - double derivative(double prediction, int expected) const override; + double loss(double prediction, double expected) const override; + double derivative(double prediction, double expected) const override; }; } } diff --git a/include/classify/loss/loss_function.h b/include/classify/loss/loss_function.h index ea9684067..77d4cd6bb 100644 --- a/include/classify/loss/loss_function.h +++ b/include/classify/loss/loss_function.h @@ -38,7 +38,7 @@ struct loss_function * prediction * @return the loss incurred */ - virtual double loss(double prediction, int expected) const = 0; + virtual double loss(double prediction, double expected) const = 0; /** * The derivative of the loss function given a predicted value and the @@ -50,7 +50,7 @@ struct 
loss_function * prediction * @return the derivative of the loss function at that point */ - virtual double derivative(double prediction, int expected) const = 0; + virtual double derivative(double prediction, double expected) const = 0; }; } } diff --git a/include/classify/loss/modified_huber.h b/include/classify/loss/modified_huber.h index 0585617e4..a8bf026a5 100644 --- a/include/classify/loss/modified_huber.h +++ b/include/classify/loss/modified_huber.h @@ -31,8 +31,8 @@ struct modified_huber : public loss_function */ const static std::string id; - double loss(double prediction, int expected) const override; - double derivative(double prediction, int expected) const override; + double loss(double prediction, double expected) const override; + double derivative(double prediction, double expected) const override; }; } } diff --git a/include/classify/loss/perceptron.h b/include/classify/loss/perceptron.h index a5accbda9..064a3e8b4 100644 --- a/include/classify/loss/perceptron.h +++ b/include/classify/loss/perceptron.h @@ -30,8 +30,8 @@ struct perceptron : public loss_function */ const static std::string id; - double loss(double prediction, int expected) const override; - double derivative(double prediction, int expected) const override; + double loss(double prediction, double expected) const override; + double derivative(double prediction, double expected) const override; }; } } diff --git a/include/classify/loss/smooth_hinge.h b/include/classify/loss/smooth_hinge.h index 26561babc..25ec35211 100644 --- a/include/classify/loss/smooth_hinge.h +++ b/include/classify/loss/smooth_hinge.h @@ -31,8 +31,8 @@ struct smooth_hinge : public loss_function */ const static std::string id; - double loss(double prediction, int expected) const override; - double derivative(double prediction, int expected) const override; + double loss(double prediction, double expected) const override; + double derivative(double prediction, double expected) const override; }; } } diff --git a/include/classify/loss/squared_hinge.h b/include/classify/loss/squared_hinge.h index fc177bb07..1897d50d3 100644 --- a/include/classify/loss/squared_hinge.h +++ b/include/classify/loss/squared_hinge.h @@ -30,8 +30,8 @@ struct squared_hinge : public loss_function */ const static std::string id; - double loss(double prediction, int expected) const override; - double derivative(double prediction, int expected) const override; + double loss(double prediction, double expected) const override; + double derivative(double prediction, double expected) const override; }; } } diff --git a/src/classify/loss/hinge.cpp b/src/classify/loss/hinge.cpp index 243f82fef..ec0d6b3f2 100644 --- a/src/classify/loss/hinge.cpp +++ b/src/classify/loss/hinge.cpp @@ -14,7 +14,7 @@ namespace loss const std::string hinge::id = "hinge"; -double hinge::loss(double prediction, int expected) const +double hinge::loss(double prediction, double expected) const { double z = prediction * expected; if (z < 1) @@ -22,7 +22,7 @@ double hinge::loss(double prediction, int expected) const return 0; } -double hinge::derivative(double prediction, int expected) const +double hinge::derivative(double prediction, double expected) const { double z = prediction * expected; if (z < 1) diff --git a/src/classify/loss/huber.cpp b/src/classify/loss/huber.cpp index b5c8b96f9..8a8ab14c1 100644 --- a/src/classify/loss/huber.cpp +++ b/src/classify/loss/huber.cpp @@ -14,7 +14,7 @@ namespace loss const std::string huber::id = "huber"; -double huber::loss(double prediction, int expected) const 
+double huber::loss(double prediction, double expected) const { double abs_diff = std::abs(prediction - expected); if (abs_diff <= 1) @@ -22,7 +22,7 @@ double huber::loss(double prediction, int expected) const return 2 * abs_diff - 1; } -double huber::derivative(double prediction, int expected) const +double huber::derivative(double prediction, double expected) const { double diff = prediction - expected; if (std::abs(diff) <= 1) diff --git a/src/classify/loss/least_squares.cpp b/src/classify/loss/least_squares.cpp index 6671e5e5a..53d0280ba 100644 --- a/src/classify/loss/least_squares.cpp +++ b/src/classify/loss/least_squares.cpp @@ -14,12 +14,12 @@ namespace loss const std::string least_squares::id = "least-squares"; -double least_squares::loss(double prediction, int expected) const +double least_squares::loss(double prediction, double expected) const { return 0.5 * (prediction - expected) * (prediction - expected); } -double least_squares::derivative(double prediction, int expected) const +double least_squares::derivative(double prediction, double expected) const { return prediction - expected; } diff --git a/src/classify/loss/logistic.cpp b/src/classify/loss/logistic.cpp index 759536b80..438c9781f 100644 --- a/src/classify/loss/logistic.cpp +++ b/src/classify/loss/logistic.cpp @@ -14,12 +14,12 @@ namespace loss const std::string logistic::id = "logistic"; -double logistic::loss(double prediction, int expected) const +double logistic::loss(double prediction, double expected) const { return std::log(1 + std::exp(-prediction * expected)); } -double logistic::derivative(double prediction, int expected) const +double logistic::derivative(double prediction, double expected) const { return -expected / (std::exp(prediction * expected) + 1); } diff --git a/src/classify/loss/modified_huber.cpp b/src/classify/loss/modified_huber.cpp index c8a33bf5e..0901fad3b 100644 --- a/src/classify/loss/modified_huber.cpp +++ b/src/classify/loss/modified_huber.cpp @@ -14,7 +14,7 @@ namespace loss const std::string modified_huber::id = "modified-huber"; -double modified_huber::loss(double prediction, int expected) const +double modified_huber::loss(double prediction, double expected) const { double z = prediction * expected; if (z < -1) @@ -24,7 +24,7 @@ double modified_huber::loss(double prediction, int expected) const return 0.5 * (1 - z) * (1 - z); } -double modified_huber::derivative(double prediction, int expected) const +double modified_huber::derivative(double prediction, double expected) const { double z = prediction * expected; if (z < -1) diff --git a/src/classify/loss/perceptron.cpp b/src/classify/loss/perceptron.cpp index 3ba531dab..9b484f02e 100644 --- a/src/classify/loss/perceptron.cpp +++ b/src/classify/loss/perceptron.cpp @@ -14,14 +14,14 @@ namespace loss const std::string perceptron::id = "perceptron"; -double perceptron::loss(double prediction, int expected) const +double perceptron::loss(double prediction, double expected) const { if (prediction * expected <= 0) return -expected * prediction; return 0; } -double perceptron::derivative(double prediction, int expected) const +double perceptron::derivative(double prediction, double expected) const { if (prediction * expected <= 0) return -expected; diff --git a/src/classify/loss/smooth_hinge.cpp b/src/classify/loss/smooth_hinge.cpp index 6c9547e1e..c1a728d2a 100644 --- a/src/classify/loss/smooth_hinge.cpp +++ b/src/classify/loss/smooth_hinge.cpp @@ -14,7 +14,7 @@ namespace loss const std::string smooth_hinge::id = "smooth-hinge"; -double 
smooth_hinge::loss(double prediction, int expected) const +double smooth_hinge::loss(double prediction, double expected) const { double z = prediction * expected; if (z <= 0) @@ -24,7 +24,7 @@ double smooth_hinge::loss(double prediction, int expected) const return 0.5 * (1 - prediction * expected) * (1 - prediction * expected); } -double smooth_hinge::derivative(double prediction, int expected) const +double smooth_hinge::derivative(double prediction, double expected) const { double z = prediction * expected; if (z <= 0) diff --git a/src/classify/loss/squared_hinge.cpp b/src/classify/loss/squared_hinge.cpp index b0f947a49..e4a4bf99a 100644 --- a/src/classify/loss/squared_hinge.cpp +++ b/src/classify/loss/squared_hinge.cpp @@ -14,7 +14,7 @@ namespace loss const std::string squared_hinge::id = "squared-hinge"; -double squared_hinge::loss(double prediction, int expected) const +double squared_hinge::loss(double prediction, double expected) const { double z = prediction * expected; if (z < 1) @@ -22,7 +22,7 @@ double squared_hinge::loss(double prediction, int expected) const return 0; } -double squared_hinge::derivative(double prediction, int expected) const +double squared_hinge::derivative(double prediction, double expected) const { double z = prediction * expected; if (z < 1) From 1aaa132cf55bf21f73fe1ae3fb57d5aaff9220e7 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Thu, 9 Apr 2015 15:03:11 -0500 Subject: [PATCH 063/481] Move corpus configuration to its own config file. We now read corpus configuration from the "dataset" folder under the "corpus" filename. That file is expected to specify the corpus type, encoding (if not utf-8), file list (if needed), and (eventually) metadata schema. --- src/corpus/corpus.cpp | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/corpus/corpus.cpp b/src/corpus/corpus.cpp index 5f02b47c2..33f32708e 100644 --- a/src/corpus/corpus.cpp +++ b/src/corpus/corpus.cpp @@ -6,6 +6,7 @@ #include "corpus/corpus.h" #include "corpus/all.h" #include "cpptoml.h" +#include "util/filesystem.h" #include "util/shim.h" namespace meta @@ -27,9 +28,9 @@ std::unique_ptr corpus::load(const std::string& config_file) { auto config = cpptoml::parse_file(config_file); - auto type = config.get_as("corpus-type"); - if (!type) - throw corpus_exception{"corpus-type missing from configuration file"}; + auto corpus = config.get_as("corpus"); + if (!corpus) + throw corpus_exception{"corpus missing from configuration file"}; auto prefix = config.get_as("prefix"); if (!prefix) @@ -39,7 +40,17 @@ std::unique_ptr corpus::load(const std::string& config_file) if (!dataset) throw corpus_exception{"dataset missing from configuration file"}; - auto enc = config.get_as("encoding"); + auto corpus_filename = *prefix + "/" + *dataset + "/" + *corpus; + if (!filesystem::file_exists(corpus_filename)) + throw corpus_exception{"corpus configuration file (" + corpus_filename + + ") not present"}; + + auto corpus_config = cpptoml::parse_file(corpus_filename); + auto type = corpus_config.get_as("type"); + if (!type) + throw corpus_exception{"type missing from corpus configuration file"}; + + auto enc = corpus_config.get_as("encoding"); std::string encoding; if (enc) encoding = *enc; @@ -48,9 +59,10 @@ std::unique_ptr corpus::load(const std::string& config_file) if (*type == "file-corpus") { - auto file_list = config.get_as("list"); + auto file_list = corpus_config.get_as("list"); if (!file_list) - throw corpus_exception{"list missing from configuration file"}; + 
throw corpus_exception{
+                "list missing from corpus configuration file"};

         std::string file = *prefix + "/" + *dataset + "/" + *file_list
                            + "-full-corpus.txt";
@@ -61,7 +73,7 @@ std::unique_ptr<corpus> corpus::load(const std::string& config_file)
     {
         std::string filename = *prefix + "/" + *dataset + "/" + *dataset
                                + ".dat";
-        auto lines = config.get_as<int64_t>("num-lines");
+        auto lines = corpus_config.get_as<int64_t>("num-lines");
         if (!lines)
             return make_unique<line_corpus>(filename, encoding);
         return make_unique<line_corpus>(filename, encoding,
From 0ffadf755fddc74cf02819c7b6a14b9e9c703844 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Fri, 10 Apr 2015 04:46:07 -0500
Subject: [PATCH 064/481] Add io::(write|read)_packed_binary functions.

This allows us to write integers/doubles as binary using a nice
endianness-independent representation that typically uses less space for
smaller numbers. It doesn't have a huge impact on model file sizes that
are then gzip compressed, but it shaves off several megabytes from e.g. a
perceptron tagger's decompressed model file.
---
 include/io/binary.h | 113 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)

diff --git a/include/io/binary.h b/include/io/binary.h
index 76ed08a6e..5597e6dd9 100644
--- a/include/io/binary.h
+++ b/include/io/binary.h
@@ -11,7 +11,10 @@
 #ifndef META_IO_BINARY_H_
 #define META_IO_BINARY_H_

+#include <array>
+#include <cmath>
 #include <iostream>
+#include <limits>
 #include <string>

 namespace meta
@@ -60,7 +63,117 @@ inline void read_binary(std::istream& in, std::string& str)
 {
     std::getline(in, str, '\0');
 }
+
+/**
+ * Writes an integral type in a packed representation. The first byte is a
+ * flag byte used to indicate two things: the first bit indicates the sign
+ * of the number, and the lowest four bits indicate the length (in
+ * bytes) of the unsigned number that follows.
+ *
+ * @see http://dlib.net/dlib/serialize.h.html
+ * @param out The stream to write to
+ * @param elem The integral type to write in packed format
+ */
+template <class T>
+void write_packed_binary(std::ostream& out, T elem)
+{
+    static_assert(std::is_integral<T>::value,
+                  "packed binary requires integers");
+
+    std::array<uint8_t, sizeof(T) + 1> buffer;
+    if (elem < 0)
+    {
+        elem *= -1;
+        buffer[0] = 0x80;
+    }
+    else
+    {
+        buffer[0] = 0;
+    }
+
+    uint8_t idx = 1;
+    for (; idx <= sizeof(T) && elem > 0; ++idx)
+    {
+        buffer[idx] = static_cast<uint8_t>(elem & 0xFF);
+        elem >>= 8;
+    }
+    buffer[0] |= (idx - 1);
+    out.write(reinterpret_cast<char*>(&buffer[0]), idx);
+}
+
+/**
+ * Writes a double in a packed integer binary representation. This splits
+ * the double into its mantissa and exponent such that
+ * mantissa * std::pow(2.0, exponent) == elem. The mantissa and exponent
+ * are integers and are written using the integer packed format.
+ *
+ * @see
+ * http://stackoverflow.com/questions/5672960/how-can-i-extract-the-mantissa-of-a-double
+ * @see http://dlib.net/dlib/float_details.h.html
+ * @param out The stream to write to
+ * @param elem The double to write in packed format
+ */
+inline void write_packed_binary(std::ostream& out, double elem)
+{
+    int exp;
+    auto digits = std::numeric_limits<double>::digits;
+    auto mantissa
+        = static_cast<int64_t>(std::frexp(elem, &exp) * (1ul << digits));
+    int16_t exponent = exp - digits;
+
+    // see dlib link above; tries to shrink mantissa for more efficient
+    // serialization
+    for (uint8_t i = 0; i < sizeof(mantissa) && (mantissa & 0xFF) == 0; ++i)
+    {
+        mantissa >>= 8;
+        exponent += 8;
+    }
+
+    write_packed_binary(out, mantissa);
+    write_packed_binary(out, exponent);
+}
+
+/**
+ * Reads an integer from its packed binary representation.
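+ * As an illustrative example: write_packed_binary(out, int64_t{-42})
+ * emits the flag byte 0x81 (sign bit set, one payload byte) followed by
+ * the byte 0x2A, and reading those two bytes back recovers -42.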
+ * @param in The stream to read from
+ * @param elem The element to write into
+ */
+template <class T>
+void read_packed_binary(std::istream& in, T& elem)
+{
+    static_assert(std::is_integral<T>::value,
+                  "packed binary requires integers");
+
+    auto flag_byte = static_cast<uint8_t>(in.get());
+    auto size = flag_byte & 0x0F;
+
+    elem = 0;
+    for (uint8_t idx = 0; idx < size; ++idx)
+    {
+        auto byte = static_cast<uint64_t>(in.get());
+        byte <<= 8 * idx;
+        elem |= byte;
+    }
+
+    if (std::is_signed<T>::value && (flag_byte & 0x80) > 0)
+    {
+        elem *= -1;
+    }
+}
+
+/**
+ * Reads a double from its packed binary representation.
+ * @param in The stream to read from
+ * @param elem The element to write into
+ */
+inline void read_packed_binary(std::istream& in, double& elem)
+{
+    int64_t mantissa;
+    int16_t exponent;
+    read_packed_binary(in, mantissa);
+    read_packed_binary(in, exponent);
+    elem = mantissa * std::pow(2.0, exponent);
+}
+}
+}
 #endif
From 82f9303689740599b3c5fa5274216afdf6b3e36e Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Fri, 10 Apr 2015 19:20:24 -0500
Subject: [PATCH 065/481] Generalize read_packed_binary functions' input stream.

Support "streams" that don't have to inherit from std::ostream for e.g.
reading from just a char array.
---
 include/io/binary.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/io/binary.h b/include/io/binary.h
index 5597e6dd9..90d735b78 100644
--- a/include/io/binary.h
+++ b/include/io/binary.h
@@ -138,8 +138,8 @@ inline void write_packed_binary(std::ostream& out, double elem)
  * @param in The stream to read from
  * @param elem The element to write into
  */
-template <class T>
-void read_packed_binary(std::istream& in, T& elem)
+template <class InputStream, class T>
+void read_packed_binary(InputStream& in, T& elem)
 {
     static_assert(std::is_integral<T>::value,
                   "packed binary requires integers");
@@ -166,7 +166,8 @@ void read_packed_binary(std::istream& in, T& elem)
  * @param in The stream to read from
  * @param elem The element to write into
  */
-inline void read_packed_binary(std::istream& in, double& elem)
+template <class InputStream>
+void read_packed_binary(InputStream& in, double& elem)
 {
     int64_t mantissa;
     int16_t exponent;
From c39e4ae17b72587560b74b4ccaf1579bb364db51 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Fri, 10 Apr 2015 19:21:38 -0500
Subject: [PATCH 066/481] Add the read interface for document metadata.

---
 include/index/metadata.h      | 195 ++++++++++++++++++++++++++++++++++
 include/index/metadata_file.h |  80 ++++++++++++++
 src/index/metadata_file.cpp   |  60 +++++++++++
 3 files changed, 335 insertions(+)
 create mode 100644 include/index/metadata.h
 create mode 100644 include/index/metadata_file.h
 create mode 100644 src/index/metadata_file.cpp

diff --git a/include/index/metadata.h b/include/index/metadata.h
new file mode 100644
index 000000000..0414cd8b9
--- /dev/null
+++ b/include/index/metadata.h
@@ -0,0 +1,195 @@
+/**
+ * @file metadata_file.h
+ * @author Chase Geigle
+ *
+ * All files in META are dual-licensed under the MIT and NCSA licenses. For more
+ * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the
+ * project.
+ */
+
+#ifndef META_INDEX_METADATA_H_
+#define META_INDEX_METADATA_H_
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "io/binary.h"
+
+namespace meta
+{
+namespace index
+{
+
+class metadata
+{
+  public:
+    /**
+     * Type tag for a field.
+     */
+    enum class field_type : uint8_t
+    {
+        SIGNED_INT = 0,
+        UNSIGNED_INT,
+        DOUBLE,
+        STRING
+    };
+
+    /**
+     * Pair for storing the schema: contains its name and type.
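+     * (For example, a string-valued "title" field would be stored as a
+     * field_info with name "title" and type field_type::STRING; the
+     * field name here is purely illustrative.)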
+     */
+    struct field_info
+    {
+        std::string name;
+        field_type type;
+    };
+
+    using schema = std::vector<field_info>;
+
+    metadata(const char* start, const schema& sch)
+        : schema_{sch}, stream_{start}
+    {
+        // nothing
+    }
+
+    template <class T>
+    T get(const std::string& name)
+    {
+        for (uint64_t i = 0; i < stored_fields_.size(); ++i)
+        {
+            if (schema_[i].name == name)
+                return stored_fields_[i];
+        }
+
+        for (uint64_t i = stored_fields_.size(); i < schema_.size(); ++i)
+        {
+            switch (schema_[i].type)
+            {
+                case field_type::SIGNED_INT:
+                    int64_t si;
+                    io::read_packed_binary(stream_, si);
+                    stored_fields_.emplace_back(si);
+                    break;
+                case field_type::UNSIGNED_INT:
+                    uint64_t ui;
+                    io::read_packed_binary(stream_, ui);
+                    stored_fields_.emplace_back(ui);
+                    break;
+                case field_type::DOUBLE:
+                    double d;
+                    io::read_packed_binary(stream_, d);
+                    stored_fields_.emplace_back(d);
+                    break;
+                case field_type::STRING:
+                    std::string s{stream_.input_};
+                    stream_.input_ += s.size() + 1;
+                    stored_fields_.emplace_back(std::move(s));
+                    break;
+            }
+
+            if (schema_[i].name == name)
+                return stored_fields_[i];
+        }
+
+        throw exception{"metadata column \"" + name + "\" not found"};
+    }
+
+    class exception : public std::runtime_error
+    {
+      public:
+        using std::runtime_error::runtime_error;
+    };
+
+  private:
+    /**
+     * Tagged union to represent a single metadata field.
+     */
+    struct field
+    {
+        union
+        {
+            int64_t sign_int;
+            uint64_t usign_int;
+            double doub;
+            std::string str;
+        };
+
+        field_type type;
+
+        field(int64_t sgn)
+            : sign_int{sgn}, type{field_type::SIGNED_INT}
+        {
+            // nothing
+        }
+
+        field(uint64_t usgn)
+            : usign_int{usgn}, type{field_type::UNSIGNED_INT}
+        {
+            // nothing
+        }
+
+        field(double d) : doub{d}, type{field_type::DOUBLE}
+        {
+            // nothing
+        }
+
+        field(std::string&& s) : type{field_type::STRING}
+        {
+            new (&str) std::string(std::move(s));
+        }
+
+        ~field()
+        {
+            // invoke string destructor if needed
+            if (type == field_type::STRING)
+                (&str)->~decltype(str)();
+        }
+
+        operator int64_t() const
+        {
+            return sign_int;
+        }
+
+        operator uint64_t() const
+        {
+            return usign_int;
+        }
+
+        operator double() const
+        {
+            return doub;
+        }
+
+        operator std::string() const
+        {
+            return str;
+        }
+    };
+
+    struct metadata_input_stream
+    {
+        metadata_input_stream(const char* input) : input_{input}
+        {
+            // nothing
+        }
+
+        char get()
+        {
+            return *input_++;
+        }
+
+        const char* input_;
+    };
+
+    /// reference to the metadata_file's schema
+    const schema& schema_;
+
+    /// the fake input stream used for read_packed_binary
+    metadata_input_stream stream_;
+
+    /// storage for decoded fields
+    std::vector<field> stored_fields_;
+};
+}
+}
+#endif
diff --git a/include/index/metadata_file.h b/include/index/metadata_file.h
new file mode 100644
index 000000000..93c4c5b0b
--- /dev/null
+++ b/include/index/metadata_file.h
@@ -0,0 +1,80 @@
+/**
+ * @file metadata_file.h
+ * @author Chase Geigle
+ *
+ * All files in META are dual-licensed under the MIT and NCSA licenses. For more
+ * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the
+ * project.
+ */
+
+#ifndef META_INDEX_METADATA_FILE_H_
+#define META_INDEX_METADATA_FILE_H_
+
+#include "util/disk_vector.h"
+#include "index/metadata.h"
+#include "io/mmap_file.h"
+
+namespace meta
+{
+namespace index
+{
+
+/**
+ * Used for reading document-level metadata for an index.
+ *
+ * The following two-file format is used:
+ *
+ * - metadata.index: disk vector indexed by document id, denoting the seek
+ *   position for each document's metadata in the metadata.db file.
+ *
+ * - metadata.db:
+ *   - <metadata_db> => <header> <doc_mdata>^<num_docs>
+ *   - <header> => <num_fields> <field>^(<num_fields> + 2)
+ *   - <num_fields> => PackedInt
+ *   - <field> => <field_name> <field_type>
+ *   - <field_name> => String
+ *   - <field_type> => field_type
+ *   - <doc_mdata> => <length> <unique_terms> <mdata_field>^FieldNum
+ *   - <length> => PackedInt
+ *   - <unique_terms> => PackedInt
+ *   - <mdata_field> => PackedInt | PackedDouble | String (depending on
+ *     <field_type> in the metadata.index header)
+ *
+ * <num_fields> is the number of user-supplied metadata fields (they must
+ * be present for all documents). We add two in the grammar above since we
+ * always represent the length (integer) and unique-terms (integer) as
+ * metadata. The "length" and "unique-terms" metadata names are
+ * **reserved**, but there can be more metadata if the user supplies it.
+ */
+class metadata_file
+{
+  public:
+    /**
+     * Opens the metadata file stored at prefix.
+     */
+    metadata_file(const std::string& prefix);
+
+    /**
+     * Obtains metadata for a document. The object returned is a proxy and
+     * will look up metadata upon first request. If metadata is requested
+     * multiple times from the same metadata object, it will not be
+     * re-parsed from the file.
+     *
+     * @param d_id The document id to look up metadata for
+     * @return the metadata for the document
+     */
+    metadata get(doc_id d_id) const;
+
+  private:
+    /// the schema for this file
+    metadata::schema schema_;
+
+    /// the seek positions for every document in this file
+    util::disk_vector<uint64_t> index_;
+
+    /// the mapped file for reading metadata from
+    io::mmap_file md_db_;
+};
+}
+}
+#endif
diff --git a/src/index/metadata_file.cpp b/src/index/metadata_file.cpp
new file mode 100644
index 000000000..d2090d646
--- /dev/null
+++ b/src/index/metadata_file.cpp
@@ -0,0 +1,60 @@
+/**
+ * @file metadata_file.cpp
+ * @author Chase Geigle
+ */
+
+#include "index/metadata_file.h"
+#include "io/binary.h"
+
+namespace meta
+{
+namespace index
+{
+
+namespace
+{
+struct char_input_stream
+{
+    char_input_stream(const char* input) : input_{input}
+    {
+        // nothing
+    }
+
+    char get()
+    {
+        return *input_++;
+    }
+
+    const char* input_;
+};
+}
+
+metadata_file::metadata_file(const std::string& prefix)
+    : index_{prefix + "/metadata.index"}, md_db_{prefix + "/metadata.db"}
+{
+    // read in the header to populate the schema
+    char_input_stream stream{md_db_.begin()};
+    uint64_t num_fields;
+    io::read_packed_binary(stream, num_fields);
+
+    schema_.reserve(num_fields);
+    for (uint64_t i = 0; i < num_fields; ++i)
+    {
+        metadata::field_info info;
+        info.name = std::string{stream.input_};
+        stream.input_ += info.name.size() + 1;
+        io::read_packed_binary(stream, info.type);
+        schema_.emplace_back(std::move(info));
+    }
+}
+
+metadata metadata_file::get(doc_id d_id) const
+{
+    if (d_id >= index_.size())
+        throw metadata::exception{"invalid doc id in metadata retrieval"};
+
+    uint64_t seek_pos = index_[d_id];
+    return {md_db_.begin() + seek_pos, schema_};
+}
+}
+}
From 2bfc91c9c98f64b535a3d73a0a7799ba321c17c9 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Fri, 10 Apr 2015 22:26:22 -0500
Subject: [PATCH 067/481] Add function for obtaining the metadata schema from cpptoml::table.

---
 include/index/metadata.h | 16 +++++++++++++---
 src/index/metadata.cpp   | 61 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 5 deletions(-)
 create mode 100644 src/index/metadata.cpp

diff --git a/include/index/metadata.h b/include/index/metadata.h
index 0414cd8b9..c2f1ccc15 100644
--- a/include/index/metadata.h
+++ b/include/index/metadata.h
@@ -1,5 +1,5 @@
 /**
- * @file metadata_file.h
+ * @file metadata.h
  * @author Chase Geigle
  *
  * All files in META are dual-licensed under the MIT and NCSA licenses.
For more @@ -14,6 +14,7 @@ #include #include +#include "cpptoml.h" #include "io/binary.h" namespace meta @@ -116,14 +117,12 @@ class metadata field_type type; - field(int64_t sgn) - : sign_int{sgn}, type{field_type::SIGNED_INT} + field(int64_t sgn) : sign_int{sgn}, type{field_type::SIGNED_INT} { // nothing } - field(uint64_t usgn) - : usign_int{usgn}, type{field_type::UNSIGNED_INT} + field(uint64_t usgn) : usign_int{usgn}, type{field_type::UNSIGNED_INT} { // nothing } @@ -190,6 +189,13 @@ class metadata /// storage for decoded fields std::vector stored_fields_; }; + +/** + * Extracts a metadata schema from a configuration file. + * @param config The configuration group that specifies the metadata + * @return the corresponding metadata::schema object. + */ +metadata::schema metadata_schema(const cpptoml::table& config); } } #endif diff --git a/src/index/metadata.cpp b/src/index/metadata.cpp new file mode 100644 index 000000000..a91e6e1f4 --- /dev/null +++ b/src/index/metadata.cpp @@ -0,0 +1,61 @@ +/** + * @file metadata.cpp + * @author Chase Geigle + */ + +#include "index/metadata.h" + +namespace meta +{ +namespace index +{ + +metadata::schema metadata_schema(const cpptoml::table& config) +{ + metadata::schema schema; + if (auto metadata = config.get_table_array("metadata")) + { + const auto& arr = metadata->get(); + schema.reserve(arr.size() + 2); + schema.emplace_back("length", metadata::field_type::UNSIGNED_INT); + schema.emplace_back("unique-terms", metadata::field_type::UNSIGNED_INT); + for (const auto& table : arr) + { + auto name = table->get_as("name"); + auto type = table->get_as("type"); + + if (!name) + throw metadata::exception{"name needed for metadata field"}; + + if (!type) + throw metadata::exception{"type needed for metadata field"}; + + index::metadata::field_type ftype; + if (*type == "int") + { + ftype = metadata::field_type::SIGNED_INT; + } + else if (*type == "uint") + { + ftype = metadata::field_type::UNSIGNED_INT; + } + else if (*type == "double") + { + ftype = metadata::field_type::DOUBLE; + } + else if (*type == "string") + { + ftype = metadata::field_type::STRING; + } + else + { + throw metadata::exception{"invalid metadata type: \"" + *type + + "\""}; + } + schema.emplace_back(*name, ftype); + } + } + return schema; +} +} +} From a9145752c41ff249e3cd159bd7003f55f5c8d718 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 10 Apr 2015 22:52:25 -0500 Subject: [PATCH 068/481] Inject schema loading into create_index. (This has some hacks to make the compilers happy...) --- include/index/metadata.h | 17 +++++++++++++++-- src/index/CMakeLists.txt | 3 ++- src/index/inverted_index.cpp | 10 ++++++++++ 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/include/index/metadata.h b/include/index/metadata.h index c2f1ccc15..2f88fb74e 100644 --- a/include/index/metadata.h +++ b/include/index/metadata.h @@ -43,9 +43,22 @@ class metadata { std::string name; field_type type; + + field_info() = default; + field_info(std::string n, field_type ft) : name{std::move(n)}, type{ft} + { + // nothing + } + field_info(const field_info&) = default; + field_info(field_info&&) = default; + field_info& operator=(const field_info&) = default; + field_info& operator=(field_info&&) = default; + ~field_info() = default; }; - using schema = std::vector; + // I want the below to be a const field_info, but g++ gives a cryptic + // compiler error in that case... clang++ accepts it just fine. 
-sigh- + using schema = std::vector; metadata(const char* start, const schema& sch) : schema_{sch}, stream_{start} @@ -141,7 +154,7 @@ class metadata { // invoke string destructor if needed if (type == field_type::STRING) - (&str)->~decltype(str)(); + (&str)->~basic_string(); } operator int64_t() const diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt index 06825181b..c7b188d81 100644 --- a/src/index/CMakeLists.txt +++ b/src/index/CMakeLists.txt @@ -5,8 +5,9 @@ add_subdirectory(ranker) add_subdirectory(tools) add_library(meta-index disk_index.cpp - inverted_index.cpp forward_index.cpp + inverted_index.cpp + metadata.cpp string_list.cpp string_list_writer.cpp vocabulary_map.cpp diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp index a474acf1c..abab6dc7e 100644 --- a/src/index/inverted_index.cpp +++ b/src/index/inverted_index.cpp @@ -8,6 +8,7 @@ #include "index/chunk_handler.h" #include "index/disk_index_impl.h" #include "index/inverted_index.h" +#include "index/metadata.h" #include "index/postings_file.h" #include "index/postings_file_writer.h" #include "index/string_list.h" @@ -115,6 +116,15 @@ void inverted_index::create_index(const std::string& config_file) // load the documents from the corpus auto docs = corpus::corpus::load(config_file); + // load the metadata schema for the corpus + auto config = cpptoml::parse_file(config_file); + auto prefix = config.get_as("prefix"); + auto dataset = config.get_as("dataset"); + auto corpus = config.get_as("corpus"); + auto corpus_config + = cpptoml::parse_file(*prefix + "/" + *dataset + "/" + *corpus); + auto schema = metadata_schema(corpus_config); + uint64_t num_docs = docs->size(); impl_->initialize_metadata(num_docs); From 2fcb76f9c5aeffae8c07688891ae532e70c9583b Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 10 Apr 2015 23:15:02 -0500 Subject: [PATCH 069/481] Add return value to write_packed_binary functions. Now returns the number of bytes used to write the element. 
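A brief usage sketch (illustrative only):

    #include <sstream>
    #include "io/binary.h"

    std::ostringstream out;
    uint64_t bytes = 0;
    bytes += meta::io::write_packed_binary(out, int64_t{-42}); // 2 bytes
    bytes += meta::io::write_packed_binary(out, 0.75);         // 4 bytes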
--- include/io/binary.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/io/binary.h b/include/io/binary.h index 90d735b78..9ba0c9cfb 100644 --- a/include/io/binary.h +++ b/include/io/binary.h @@ -73,9 +73,10 @@ inline void read_binary(std::istream& in, std::string& str) * @see http://dlib.net/dlib/serialize.h.html * @param out The stream to write to * @param elem The integral type to write in packed format + * @return the number of bytes used to write out elem */ template -void write_packed_binary(std::ostream& out, T elem) +uint64_t write_packed_binary(std::ostream& out, T elem) { static_assert(std::is_integral::value, "packed binary requires integers"); @@ -99,6 +100,7 @@ void write_packed_binary(std::ostream& out, T elem) } buffer[0] |= (idx - 1); out.write(reinterpret_cast(&buffer[0]), idx); + return idx; } /** @@ -112,8 +114,9 @@ void write_packed_binary(std::ostream& out, T elem) * @see http://dlib.net/dlib/float_details.h.html * @param out The stream to write to * @param elem The double to write in packed format + * @return the number of bytes used to write out elem */ -inline void write_packed_binary(std::ostream& out, double elem) +inline uint64_t write_packed_binary(std::ostream& out, double elem) { int exp; auto digits = std::numeric_limits::digits; @@ -129,8 +132,9 @@ inline void write_packed_binary(std::ostream& out, double elem) exponent += 8; } - write_packed_binary(out, mantissa); - write_packed_binary(out, exponent); + auto bytes = write_packed_binary(out, mantissa); + bytes += write_packed_binary(out, exponent); + return bytes; } /** From a93737ba5c0a1106f08decc048a6ef1a7b8b20e9 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sat, 11 Apr 2015 16:14:42 -0500 Subject: [PATCH 070/481] Return bytes written from regular io::write_binary functions. --- include/io/binary.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/io/binary.h b/include/io/binary.h index 9ba0c9cfb..9a4e47008 100644 --- a/include/io/binary.h +++ b/include/io/binary.h @@ -28,9 +28,10 @@ namespace io * @param elem The element to write */ template -void write_binary(std::ostream& out, const T& elem) +uint64_t write_binary(std::ostream& out, const T& elem) { out.write(reinterpret_cast(&elem), sizeof(T)); + return sizeof(T); } /** @@ -38,9 +39,10 @@ void write_binary(std::ostream& out, const T& elem) * @param out The stream to write to * @param str the string to write */ -inline void write_binary(std::ostream& out, const std::string& str) +inline uint64_t write_binary(std::ostream& out, const std::string& str) { out.write(str.c_str(), str.size() + 1); + return str.size() + 1; } /** From c05e00590544c6fda5b1d936686418e1a92330cc Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sat, 11 Apr 2015 16:15:11 -0500 Subject: [PATCH 071/481] Only have schema contain non-mandatory metadata fields. 
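For example, given a corpus configuration like the following
(hypothetical field names), the parsed schema now contains only the two
user-supplied fields; "length" and "unique-terms" are still always
written, just no longer duplicated in the schema:

    [[metadata]]
    name = "title"
    type = "string"

    [[metadata]]
    name = "year"
    type = "uint"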
--- src/index/metadata.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/index/metadata.cpp b/src/index/metadata.cpp index a91e6e1f4..e7896b753 100644 --- a/src/index/metadata.cpp +++ b/src/index/metadata.cpp @@ -16,9 +16,7 @@ metadata::schema metadata_schema(const cpptoml::table& config) if (auto metadata = config.get_table_array("metadata")) { const auto& arr = metadata->get(); - schema.reserve(arr.size() + 2); - schema.emplace_back("length", metadata::field_type::UNSIGNED_INT); - schema.emplace_back("unique-terms", metadata::field_type::UNSIGNED_INT); + schema.reserve(arr.size()); for (const auto& table : arr) { auto name = table->get_as("name"); From 2e5e68f145d8d55f29bde8e8ed976a39548dea5d Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sat, 11 Apr 2015 23:02:54 -0500 Subject: [PATCH 072/481] Add metadata writing to index creation. The index now uses the metadata file for reading document paths (if they exist), lengths, and unique terms. The inverted-index unit tests are now passing. --- include/index/disk_index_impl.h | 76 ++++---------------------------- include/index/metadata.h | 36 ++++++++++++--- include/index/metadata_file.h | 5 +++ include/index/metadata_parser.h | 34 ++++++++++++++ include/index/metadata_writer.h | 41 +++++++++++++++++ include/io/binary.h | 2 +- src/index/CMakeLists.txt | 3 ++ src/index/disk_index.cpp | 57 +++++------------------- src/index/forward_index.cpp | 20 ++++----- src/index/inverted_index.cpp | 40 +++++++++++------ src/index/metadata_file.cpp | 18 ++++++-- src/index/metadata_parser.cpp | 52 ++++++++++++++++++++++ src/index/metadata_writer.cpp | 73 ++++++++++++++++++++++++++++++ src/test/inverted_index_test.cpp | 3 +- 14 files changed, 312 insertions(+), 148 deletions(-) create mode 100644 include/index/metadata_parser.h create mode 100644 include/index/metadata_writer.h create mode 100644 src/index/metadata_parser.cpp create mode 100644 src/index/metadata_writer.cpp diff --git a/include/index/disk_index_impl.h b/include/index/disk_index_impl.h index 39fed93a5..3cf91ff60 100644 --- a/include/index/disk_index_impl.h +++ b/include/index/disk_index_impl.h @@ -13,6 +13,7 @@ #include #include "index/disk_index.h" +#include "index/metadata_file.h" #include "index/string_list.h" #include "index/vocabulary_map.h" #include "util/disk_vector.h" @@ -31,15 +32,14 @@ class string_list_writer; */ enum index_file { - DOC_IDS_MAPPING = 0, - DOC_IDS_MAPPING_INDEX, - DOC_SIZES, DOC_LABELS, - DOC_UNIQUETERMS, LABEL_IDS_MAPPING, POSTINGS, + POSTINGS_INDEX, TERM_IDS_MAPPING, - TERM_IDS_MAPPING_INVERSE + TERM_IDS_MAPPING_INVERSE, + METADATA_DB, + METADATA_INDEX }; /** @@ -57,17 +57,9 @@ class disk_index::disk_index_impl const static std::vector files; /** - * Initializes the following metadata maps: - * doc_sizes_, labels_, unique_terms_ - * @param num_docs The number of documents stored in the index - */ - void initialize_metadata(uint64_t num_docs = 0); - - /** - * Loads the doc sizes. - * @param num_docs The number of documents stored in the index + * Loads the metadata file. */ - void load_doc_sizes(uint64_t num_docs = 0); + void initialize_metadata(); /** * Loads the doc labels. @@ -75,17 +67,6 @@ class disk_index::disk_index_impl */ void load_labels(uint64_t num_docs = 0); - /** - * Loads the unique terms per document. - * @param num_docs The number of documents stored in the index - */ - void load_unique_terms(uint64_t num_docs = 0); - - /** - * Loads the doc_id mapping. 
- */ - void load_doc_id_mapping(); - /** * Loads the term_id mapping. */ @@ -101,14 +82,6 @@ class disk_index::disk_index_impl */ void save_label_id_mapping(); - /** - * Creates a string_list_writer for writing the docids mapping. - * @param num_docs The number of documents stored in the index, as the size - * of the string_list_writer - * @return the string_list_writer to write doc ids - */ - string_list_writer make_doc_id_writer(uint64_t num_docs) const; - /** * Sets the label for a document. * @param id The document id @@ -116,20 +89,6 @@ class disk_index::disk_index_impl */ void set_label(doc_id id, const class_label& label); - /** - * Sets the size of a document. - * @param id The document id - * @param length The number of terms that will appear in the document - */ - void set_length(doc_id id, uint64_t length); - - /** - * Sets the number of unique terms for a document. - * @param id The document id - * @param terms The number of unique terms that will appear in the document - */ - void set_unique_terms(doc_id id, uint64_t terms); - /** * @return the total number of unique terms in the index. */ @@ -157,31 +116,14 @@ class disk_index::disk_index_impl /// the location of this index std::string index_name_; - /** - * doc_id -> document path mapping. - * Each index corresponds to a doc_id (uint64_t). - */ - util::optional doc_id_mapping_; - - /** - * doc_id -> document length mapping. - * Each index corresponds to a doc_id (uint64_t). - */ - util::optional> doc_sizes_; - /** * Maps which class a document belongs to (if any). * Each index corresponds to a doc_id (uint64_t). */ util::optional> labels_; - /** - * Holds how many unique terms there are per-document. This is sort of - * like an inverse IDF. For a forward_index, this field is certainly - * redundant, though it can save querying the postings file. - * Each index corresponds to a doc_id (uint64_t). - */ - util::optional> unique_terms_; + /// Stores additional metadata for each document + util::optional metadata_; /// Maps string terms to term_ids. util::optional term_id_mapping_; diff --git a/include/index/metadata.h b/include/index/metadata.h index 2f88fb74e..977053a1b 100644 --- a/include/index/metadata.h +++ b/include/index/metadata.h @@ -16,6 +16,7 @@ #include "cpptoml.h" #include "io/binary.h" +#include "util/optional.h" namespace meta { @@ -67,12 +68,12 @@ class metadata } template - T get(const std::string& name) + util::optional get(const std::string& name) { for (uint64_t i = 0; i < stored_fields_.size(); ++i) { if (schema_[i].name == name) - return stored_fields_[i]; + return {stored_fields_[i]}; } for (uint64_t i = stored_fields_.size(); i < schema_.size(); ++i) @@ -84,16 +85,19 @@ class metadata io::read_packed_binary(stream_, si); stored_fields_.emplace_back(si); break; + case field_type::UNSIGNED_INT: uint64_t ui; io::read_packed_binary(stream_, ui); stored_fields_.emplace_back(ui); break; + case field_type::DOUBLE: double d; io::read_packed_binary(stream_, d); stored_fields_.emplace_back(d); break; + case field_type::STRING: std::string s{stream_.input_}; stream_.input_ += s.size() + 1; @@ -102,10 +106,10 @@ class metadata } if (schema_[i].name == name) - return stored_fields_[i]; + return {stored_fields_[i]}; } - throw exception{"metadata column \"" + name + "\" not found"}; + return util::nullopt; } class exception : public std::runtime_error @@ -114,7 +118,6 @@ class metadata using std::runtime_error::runtime_error; }; - private: /** * Tagged union to represent a single metadata field. 
*/ @@ -150,6 +153,28 @@ class metadata new (&str) std::string(std::move(s)); } + field(field&& other) : type{other.type} + { + switch (type) + { + case field_type::SIGNED_INT: + sign_int = other.sign_int; + break; + + case field_type::UNSIGNED_INT: + usign_int = other.usign_int; + break; + + case field_type::DOUBLE: + doub = other.doub; + break; + + case field_type::STRING: + new (&str) std::string(std::move(other.str)); + break; + } + } + ~field() { // invoke string destructor if needed @@ -178,6 +203,7 @@ class metadata } }; + private: struct metadata_input_stream { metadata_input_stream(const char* input) : input_{input} diff --git a/include/index/metadata_file.h b/include/index/metadata_file.h index 93c4c5b0b..433a4fc6c 100644 --- a/include/index/metadata_file.h +++ b/include/index/metadata_file.h @@ -65,6 +65,11 @@ class metadata_file */ metadata get(doc_id d_id) const; + /** + * @return the number of documents in this database + */ + uint64_t size() const; + private: /// the schema for this file metadata::schema schema_; diff --git a/include/index/metadata_parser.h b/include/index/metadata_parser.h new file mode 100644 index 000000000..65a3dc707 --- /dev/null +++ b/include/index/metadata_parser.h @@ -0,0 +1,34 @@ +/** + * @file metadata_parser.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_INDEX_METADATA_PARSER_H_ +#define META_INDEX_METADATA_PARSER_H_ + +#include "index/metadata.h" +#include "io/parser.h" + +namespace meta +{ +namespace index +{ + +class metadata_parser +{ + public: + metadata_parser(const std::string& filename, + const metadata::schema& schema); + + std::vector next(); + private: + io::parser parser_; + const metadata::schema& schema_; +}; +} +} +#endif diff --git a/include/index/metadata_writer.h b/include/index/metadata_writer.h new file mode 100644 index 000000000..52d5a0d25 --- /dev/null +++ b/include/index/metadata_writer.h @@ -0,0 +1,41 @@ +/** + * @file metadata_writer.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. 
+ */ + +#ifndef META_INDEX_METADATA_WRITER_H_ +#define META_INDEX_METADATA_WRITER_H_ + +#include +#include "corpus/document.h" +#include "index/metadata.h" +#include "util/disk_vector.h" + +namespace meta +{ +namespace index +{ + +class metadata_writer +{ + public: + metadata_writer(const std::string& prefix, uint64_t num_docs, + const metadata::schema& schema); + + void write(doc_id d_id, uint64_t length, uint64_t num_unique, + const std::vector& mdata); + + private: + std::mutex lock_; + util::disk_vector seek_pos_; + uint64_t byte_pos_; + std::ofstream db_file_; + const metadata::schema& schema_; +}; +} +} +#endif diff --git a/include/io/binary.h b/include/io/binary.h index 9a4e47008..cf0e3b340 100644 --- a/include/io/binary.h +++ b/include/io/binary.h @@ -156,7 +156,7 @@ void read_packed_binary(InputStream& in, T& elem) elem = 0; for (uint8_t idx = 0; idx < size; ++idx) { - auto byte = static_cast(in.get()); + uint64_t byte = static_cast(in.get()); byte <<= 8 * idx; elem |= byte; } diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt index c7b188d81..a6532b118 100644 --- a/src/index/CMakeLists.txt +++ b/src/index/CMakeLists.txt @@ -8,6 +8,9 @@ add_library(meta-index disk_index.cpp forward_index.cpp inverted_index.cpp metadata.cpp + metadata_file.cpp + metadata_parser.cpp + metadata_writer.cpp string_list.cpp string_list_writer.cpp vocabulary_map.cpp diff --git a/src/index/disk_index.cpp b/src/index/disk_index.cpp index faa046d11..c63b2bc0e 100644 --- a/src/index/disk_index.cpp +++ b/src/index/disk_index.cpp @@ -75,7 +75,7 @@ std::vector disk_index::class_labels() const uint64_t disk_index::unique_terms(doc_id d_id) const { - return impl_->unique_terms_->at(d_id); + return *impl_->metadata_->get(d_id).get("unique-terms"); } uint64_t disk_index::unique_terms() const @@ -85,12 +85,12 @@ uint64_t disk_index::unique_terms() const uint64_t disk_index::doc_size(doc_id d_id) const { - return impl_->doc_sizes_->at(d_id); + return *impl_->metadata_->get(d_id).get("length"); } uint64_t disk_index::num_docs() const { - return impl_->doc_sizes_->size(); + return impl_->metadata_->size(); } std::string disk_index::doc_name(doc_id d_id) const @@ -101,12 +101,14 @@ std::string disk_index::doc_name(doc_id d_id) const std::string disk_index::doc_path(doc_id d_id) const { - return impl_->doc_id_mapping_->at(d_id); + if (auto path = impl_->metadata_->get(d_id).get("path")) + return *path; + return "[none]"; } std::vector disk_index::docs() const { - std::vector ret(impl_->doc_id_mapping_->size()); + std::vector ret(num_docs()); std::iota(ret.begin(), ret.end(), 0); return ret; } @@ -114,9 +116,9 @@ std::vector disk_index::docs() const // disk_index_impl const std::vector disk_index::disk_index_impl::files - = {"/docids.mapping", "/docids.mapping_index", "/docsizes.counts", - "/docs.labels", "/docs.uniqueterms", "/labelids.mapping", - "/postings.index", "/termids.mapping", "/termids.mapping.inverse"}; + = {"/docs.labels", "/labelids.mapping", "/postings.index", + "/postings.index_index", "/termids.mapping", "/termids.mapping.inverse", + "/metadata.db", "/metadata.index"}; label_id disk_index::disk_index_impl::get_label_id(const class_label& lbl) { @@ -132,17 +134,9 @@ label_id disk_index::disk_index_impl::get_label_id(const class_label& lbl) return label_ids_.get_value(lbl); } -void disk_index::disk_index_impl::initialize_metadata(uint64_t num_docs) +void disk_index::disk_index_impl::initialize_metadata() { - load_doc_sizes(num_docs); - load_labels(num_docs); - load_unique_terms(num_docs); -} - 
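// (Annotation, not part of this patch: the three per-document structures
// formerly loaded here -- docsizes.counts, docs.uniqueterms, and the
// docids string list -- are subsumed by the single metadata_file that the
// new initialize_metadata() opens below; only docs.labels remains a
// separate disk_vector.)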
-void disk_index::disk_index_impl::load_doc_sizes(uint64_t num_docs) -{ - doc_sizes_ - = util::disk_vector{index_name_ + files[DOC_SIZES], num_docs}; + metadata_ = {index_name_}; } void disk_index::disk_index_impl::load_labels(uint64_t num_docs) @@ -151,17 +145,6 @@ void disk_index::disk_index_impl::load_labels(uint64_t num_docs) num_docs}; } -void disk_index::disk_index_impl::load_unique_terms(uint64_t num_docs) -{ - unique_terms_ = util::disk_vector{ - index_name_ + files[DOC_UNIQUETERMS], num_docs}; -} - -void disk_index::disk_index_impl::load_doc_id_mapping() -{ - doc_id_mapping_ = string_list{index_name_ + files[DOC_IDS_MAPPING]}; -} - void disk_index::disk_index_impl::load_term_id_mapping() { term_id_mapping_ = vocabulary_map{index_name_ + files[TERM_IDS_MAPPING]}; @@ -177,27 +160,11 @@ void disk_index::disk_index_impl::save_label_id_mapping() map::save_mapping(label_ids_, index_name_ + files[LABEL_IDS_MAPPING]); } -string_list_writer - disk_index::disk_index_impl::make_doc_id_writer(uint64_t num_docs) const -{ - return {index_name_ + files[DOC_IDS_MAPPING], num_docs}; -} - void disk_index::disk_index_impl::set_label(doc_id id, const class_label& label) { (*labels_)[id] = get_label_id(label); } -void disk_index::disk_index_impl::set_length(doc_id id, uint64_t length) -{ - (*doc_sizes_)[id] = length; -} - -void disk_index::disk_index_impl::set_unique_terms(doc_id id, uint64_t terms) -{ - (*unique_terms_)[id] = terms; -} - uint64_t disk_index::disk_index_impl::total_unique_terms() const { return term_id_mapping_->size(); diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index 2cb9f9e5b..c195cf8ab 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -8,6 +8,7 @@ #include "index/disk_index_impl.h" #include "index/forward_index.h" #include "index/inverted_index.h" +#include "index/metadata_writer.h" #include "index/postings_file.h" #include "index/postings_file_writer.h" #include "index/string_list.h" @@ -144,7 +145,6 @@ void forward_index::load_index() LOG(info) << "Loading index from disk: " << index_name() << ENDLG; impl_->initialize_metadata(); - impl_->load_doc_id_mapping(); auto config = cpptoml::parse_file(index_name() + "/config.toml"); if (!fwd_impl_->is_libsvm_format(config)) @@ -189,7 +189,6 @@ void forward_index::create_index(const std::string& config_file) impl_->load_label_id_mapping(); fwd_impl_->load_postings(); - impl_->load_doc_id_mapping(); impl_->initialize_metadata(); { @@ -217,18 +216,20 @@ void forward_index::impl::create_libsvm_postings(const cpptoml::table& config) auto filename = idx_->index_name() + idx_->impl_->files[POSTINGS]; uint64_t num_docs = filesystem::num_lines(libsvm_data); - idx_->impl_->initialize_metadata(num_docs); + idx_->impl_->load_labels(num_docs); total_unique_terms_ = 0; { postings_file_writer out{filename, num_docs}; + // make md_writer with empty schema + metadata_writer md_writer{idx_->index_name(), num_docs, {}}; + printing::progress progress{" > Creating postings from libsvm data: ", num_docs}; doc_id d_id{0}; std::ifstream input{libsvm_data}; std::string line; - auto docid_writer = idx_->impl_->make_doc_id_writer(num_docs); while (std::getline(input, line)) { progress(d_id); @@ -252,10 +253,8 @@ void forward_index::impl::create_libsvm_postings(const cpptoml::table& config) pdata.set_counts(counts); out.write(pdata); - docid_writer.insert(d_id, "[no path]"); - idx_->impl_->set_length(d_id, static_cast(length)); - idx_->impl_->set_unique_terms(d_id, num_unique); - + md_writer.write(d_id, 
static_cast(length), num_unique, + {}); ++d_id; } @@ -271,9 +270,8 @@ void forward_index::impl::create_libsvm_postings(const cpptoml::table& config) void forward_index::impl::create_uninverted_metadata(const std::string& name) { - auto files = {DOC_IDS_MAPPING, DOC_IDS_MAPPING_INDEX, DOC_SIZES, DOC_LABELS, - DOC_UNIQUETERMS, LABEL_IDS_MAPPING, TERM_IDS_MAPPING, - TERM_IDS_MAPPING_INVERSE}; + auto files = {DOC_LABELS, LABEL_IDS_MAPPING, TERM_IDS_MAPPING, + TERM_IDS_MAPPING_INVERSE, METADATA_DB, METADATA_INDEX}; for (const auto& file : files) filesystem::copy_file(name + idx_->impl_->files[file], diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp index abab6dc7e..3e1a9a148 100644 --- a/src/index/inverted_index.cpp +++ b/src/index/inverted_index.cpp @@ -8,7 +8,8 @@ #include "index/chunk_handler.h" #include "index/disk_index_impl.h" #include "index/inverted_index.h" -#include "index/metadata.h" +#include "index/metadata_parser.h" +#include "index/metadata_writer.h" #include "index/postings_file.h" #include "index/postings_file_writer.h" #include "index/string_list.h" @@ -47,10 +48,14 @@ class inverted_index::impl /** * @param docs The documents to be tokenized * @param handler The chunk handler for this index + * @param mdata_parser The parser for reading metadata + * @param mdata_writer The writer for metadata * @return the number of chunks created */ void tokenize_docs(corpus::corpus* docs, - chunk_handler& handler); + chunk_handler& handler, + metadata_parser& mdata_parser, + metadata_writer& mdata_writer); /** * Compresses the large postings file. @@ -125,13 +130,18 @@ void inverted_index::create_index(const std::string& config_file) = cpptoml::parse_file(*prefix + "/" + *dataset + "/" + *corpus); auto schema = metadata_schema(corpus_config); - uint64_t num_docs = docs->size(); - impl_->initialize_metadata(num_docs); - chunk_handler handler{index_name()}; - inv_impl_->tokenize_docs(docs.get(), handler); - impl_->load_doc_id_mapping(); + { + metadata_parser mdata_parser{*prefix + "/" + *dataset + "/metadata.dat", + schema}; + metadata_writer mdata_writer{index_name(), docs->size(), schema}; + uint64_t num_docs = docs->size(); + impl_->load_labels(num_docs); + + inv_impl_->tokenize_docs(docs.get(), handler, mdata_parser, + mdata_writer); + } handler.merge_chunks(); @@ -144,6 +154,7 @@ void inverted_index::create_index(const std::string& config_file) num_unique_terms); impl_->load_term_id_mapping(); + impl_->initialize_metadata(); impl_->save_label_id_mapping(); inv_impl_->load_postings(); @@ -158,18 +169,17 @@ void inverted_index::load_index() auto config = cpptoml::parse_file(index_name() + "/config.toml"); impl_->initialize_metadata(); - impl_->load_doc_id_mapping(); impl_->load_term_id_mapping(); impl_->load_label_id_mapping(); inv_impl_->load_postings(); } void inverted_index::impl::tokenize_docs(corpus::corpus* docs, - chunk_handler& handler) + chunk_handler& handler, + metadata_parser& mdata_parser, + metadata_writer& mdata_writer) { std::mutex mutex; - auto docid_writer = idx_->impl_->make_doc_id_writer(docs->size()); - printing::progress progress{" > Tokenizing Docs: ", docs->size()}; auto task = [&]() @@ -179,6 +189,7 @@ void inverted_index::impl::tokenize_docs(corpus::corpus* docs, while (true) { util::optional doc; + util::optional> mdata; { std::lock_guard lock{mutex}; @@ -186,6 +197,7 @@ void inverted_index::impl::tokenize_docs(corpus::corpus* docs, return; // destructor for producer will write // any intermediate chunks doc = docs->next(); + mdata = 
mdata_parser.next(); progress(doc->id()); } @@ -201,10 +213,10 @@ void inverted_index::impl::tokenize_docs(corpus::corpus* docs, } // save metadata - docid_writer.insert(doc->id(), doc->path()); - idx_->impl_->set_length(doc->id(), doc->length()); - idx_->impl_->set_unique_terms(doc->id(), doc->counts().size()); + mdata_writer.write(doc->id(), doc->length(), doc->counts().size(), + *mdata); idx_->impl_->set_label(doc->id(), doc->label()); + // update chunk producer(doc->id(), doc->counts()); } diff --git a/src/index/metadata_file.cpp b/src/index/metadata_file.cpp index d2090d646..9dc1a97ef 100644 --- a/src/index/metadata_file.cpp +++ b/src/index/metadata_file.cpp @@ -15,17 +15,22 @@ namespace { struct char_input_stream { - char_input_stream(const char* input) : input_{input} + char_input_stream(const char* input, const char* end) + : input_{input}, end_{end} { // nothing } char get() { + if (input_ == end_) + throw metadata::exception{"seeking past end of metadata file"}; + return *input_++; } const char* input_; + const char* end_; }; } @@ -33,7 +38,7 @@ metadata_file::metadata_file(const std::string& prefix) : index_{prefix + "/metadata.index"}, md_db_{prefix + "/metadata.db"} { // read in the header to populate the schema - char_input_stream stream{md_db_.begin()}; + char_input_stream stream{md_db_.begin(), md_db_.begin() + md_db_.size()}; uint64_t num_fields; io::read_packed_binary(stream, num_fields); @@ -43,7 +48,9 @@ metadata_file::metadata_file(const std::string& prefix) metadata::field_info info; info.name = std::string{stream.input_}; stream.input_ += info.name.size() + 1; - io::read_packed_binary(stream, info.type); + static_assert(sizeof(metadata::field_type) == sizeof(uint8_t), + "metadata::field_type size not updated in metadata_file"); + info.type = static_cast(stream.get()); schema_.emplace_back(std::move(info)); } } @@ -56,5 +63,10 @@ metadata metadata_file::get(doc_id d_id) const uint64_t seek_pos = index_[d_id]; return {md_db_.begin() + seek_pos, schema_}; } + +uint64_t metadata_file::size() const +{ + return index_.size(); +} } } diff --git a/src/index/metadata_parser.cpp b/src/index/metadata_parser.cpp new file mode 100644 index 000000000..fc8e196ff --- /dev/null +++ b/src/index/metadata_parser.cpp @@ -0,0 +1,52 @@ +/** + * @file metadata_parser.cpp + * @author Chase Geigle + */ + +#include "index/metadata_parser.h" + +namespace meta +{ +namespace index +{ + +metadata_parser::metadata_parser(const std::string& filename, + const metadata::schema& schema) + : parser_{filename, "\n\t"}, schema_{schema} +{ + // nothing +} + +std::vector metadata_parser::next() +{ + std::vector mdata; + mdata.reserve(schema_.size()); + for (const auto& finfo : schema_) + { + if (!parser_.has_next()) + throw metadata::exception{"metadata input file ended prematurely"}; + auto str = parser_.next(); + + switch (finfo.type) + { + case metadata::field_type::SIGNED_INT: + mdata.emplace_back(static_cast(std::stol(str))); + break; + + case metadata::field_type::UNSIGNED_INT: + mdata.emplace_back(static_cast(std::stoul(str))); + break; + + case metadata::field_type::DOUBLE: + mdata.emplace_back(std::stod(str)); + break; + + case metadata::field_type::STRING: + mdata.emplace_back(std::move(str)); + break; + } + } + return mdata; +} +} +} diff --git a/src/index/metadata_writer.cpp b/src/index/metadata_writer.cpp new file mode 100644 index 000000000..86260ac17 --- /dev/null +++ b/src/index/metadata_writer.cpp @@ -0,0 +1,73 @@ +/** + * @file metadata_writer.cpp + * @author Chase Geigle + */ + +#include 
"index/metadata_writer.h" +#include "io/binary.h" + +namespace meta +{ +namespace index +{ + +metadata_writer::metadata_writer(const std::string& prefix, uint64_t num_docs, + const metadata::schema& schema) + : seek_pos_{prefix + "/metadata.index", num_docs}, + byte_pos_{0}, + db_file_{prefix + "/metadata.db", std::ios::binary}, + schema_{schema} +{ + // write metadata header + byte_pos_ += io::write_packed_binary(db_file_, schema_.size() + 2); + byte_pos_ += io::write_binary(db_file_, std::string{"length"}); + byte_pos_ += io::write_binary(db_file_, metadata::field_type::UNSIGNED_INT); + byte_pos_ += io::write_binary(db_file_, std::string{"unique-terms"}); + byte_pos_ += io::write_binary(db_file_, metadata::field_type::UNSIGNED_INT); + + for (const auto& finfo : schema_) + { + byte_pos_ += io::write_binary(db_file_, finfo.name); + byte_pos_ += io::write_binary(db_file_, finfo.type); + } +} + +void metadata_writer::write(doc_id d_id, uint64_t length, uint64_t num_unique, + const std::vector& mdata) +{ + std::lock_guard lock{lock_}; + + seek_pos_[d_id] = byte_pos_; + // write "mandatory" metadata + assert(length < 100000); + byte_pos_ += io::write_packed_binary(db_file_, length); + byte_pos_ += io::write_packed_binary(db_file_, num_unique); + + // write optional metadata + if (mdata.size() != schema_.size()) + throw metadata::exception{"schema mismatch when writing metadata"}; + + for (const auto& fld : mdata) + { + switch (fld.type) + { + case metadata::field_type::SIGNED_INT: + byte_pos_ += io::write_packed_binary(db_file_, fld.sign_int); + break; + + case metadata::field_type::UNSIGNED_INT: + byte_pos_ += io::write_packed_binary(db_file_, fld.usign_int); + break; + + case metadata::field_type::DOUBLE: + byte_pos_ += io::write_packed_binary(db_file_, fld.doub); + break; + + case metadata::field_type::STRING: + byte_pos_ += io::write_binary(db_file_, fld.str); + break; + } + } +} +} +} diff --git a/src/test/inverted_index_test.cpp b/src/test/inverted_index_test.cpp index 9dcd552fd..e615994fa 100644 --- a/src/test/inverted_index_test.cpp +++ b/src/test/inverted_index_test.cpp @@ -48,9 +48,8 @@ void create_config(const std::string& corpus_type) << "\"\n" << "query-judgements = \"" << *query_judgements << "\"\n" << "libsvm-modules = \"" << *libsvm_modules << "\"\n" - << "corpus-type = \"" << corpus_type << "-corpus\"\n" - << "list= \"ceeaus\"\n" << "dataset = \"ceeaus\"\n" + << "corpus = \"" << corpus_type << ".toml\"\n" << "encoding = \"shift_jis\"\n" << "forward-index = \"ceeaus-fwd\"\n" << "inverted-index = \"ceeaus-inv\"\n" From e6b988376f2dfa81d80368c7bd9c3024e8d7e820 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sat, 11 Apr 2015 23:10:24 -0500 Subject: [PATCH 073/481] Make forward-index and classifiers unit tests pass. 
--- src/index/forward_index.cpp | 2 ++ src/index/metadata_writer.cpp | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index c195cf8ab..3ab5bf9e0 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -145,6 +145,7 @@ void forward_index::load_index() LOG(info) << "Loading index from disk: " << index_name() << ENDLG; impl_->initialize_metadata(); + impl_->load_labels(); auto config = cpptoml::parse_file(index_name() + "/config.toml"); if (!fwd_impl_->is_libsvm_format(config)) @@ -190,6 +191,7 @@ void forward_index::create_index(const std::string& config_file) impl_->load_label_id_mapping(); fwd_impl_->load_postings(); impl_->initialize_metadata(); + impl_->load_labels(); { std::ofstream unique_terms_file{index_name() + "/corpus.uniqueterms"}; diff --git a/src/index/metadata_writer.cpp b/src/index/metadata_writer.cpp index 86260ac17..acbc759f8 100644 --- a/src/index/metadata_writer.cpp +++ b/src/index/metadata_writer.cpp @@ -39,7 +39,6 @@ void metadata_writer::write(doc_id d_id, uint64_t length, uint64_t num_unique, seek_pos_[d_id] = byte_pos_; // write "mandatory" metadata - assert(length < 100000); byte_pos_ += io::write_packed_binary(db_file_, length); byte_pos_ += io::write_packed_binary(db_file_, num_unique); From 63e024577b6c1445734e0fad58ade3c44df19f07 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sat, 11 Apr 2015 23:49:54 -0500 Subject: [PATCH 074/481] Add document path metadata storage. --- include/index/metadata_file.h | 4 ++-- include/index/metadata_writer.h | 1 + src/index/forward_index.cpp | 2 +- src/index/inverted_index.cpp | 3 +-- src/index/metadata_writer.cpp | 4 ++++ 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/index/metadata_file.h b/include/index/metadata_file.h index 433a4fc6c..6a4f7f3c7 100644 --- a/include/index/metadata_file.h +++ b/include/index/metadata_file.h @@ -29,7 +29,7 @@ namespace index * * - metadata.db: * - =>
^ - * -
=> ^( + 2) + * -
=> ^( + 3) * - => PackedInt * - => * - => String @@ -43,7 +43,7 @@ namespace index * is the number of user-supplied metadata fields (they must * be present for all documents). We add two in the grammar above since we * always represent the length (integer) and unique-terms (integer) as - * metadata. The "length" and "unique-terms" metadata names are + * metadata. The "length", "unique-terms", and "path" metadata names are * **reserved**, but there can be more metadata if the user supplies it. */ class metadata_file diff --git a/include/index/metadata_writer.h b/include/index/metadata_writer.h index 52d5a0d25..5c481e2ad 100644 --- a/include/index/metadata_writer.h +++ b/include/index/metadata_writer.h @@ -27,6 +27,7 @@ class metadata_writer const metadata::schema& schema); void write(doc_id d_id, uint64_t length, uint64_t num_unique, + const std::string& path, const std::vector& mdata); private: diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index 3ab5bf9e0..3ebac3f8b 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -255,7 +255,7 @@ void forward_index::impl::create_libsvm_postings(const cpptoml::table& config) pdata.set_counts(counts); out.write(pdata); - md_writer.write(d_id, static_cast(length), num_unique, + md_writer.write(d_id, static_cast(length), num_unique, "", {}); ++d_id; } diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp index 3e1a9a148..c10b13251 100644 --- a/src/index/inverted_index.cpp +++ b/src/index/inverted_index.cpp @@ -212,9 +212,8 @@ void inverted_index::impl::tokenize_docs(corpus::corpus* docs, << ") generated!" << ENDLG; } - // save metadata mdata_writer.write(doc->id(), doc->length(), doc->counts().size(), - *mdata); + doc->path(), *mdata); idx_->impl_->set_label(doc->id(), doc->label()); // update chunk diff --git a/src/index/metadata_writer.cpp b/src/index/metadata_writer.cpp index acbc759f8..2cd530c17 100644 --- a/src/index/metadata_writer.cpp +++ b/src/index/metadata_writer.cpp @@ -24,6 +24,8 @@ metadata_writer::metadata_writer(const std::string& prefix, uint64_t num_docs, byte_pos_ += io::write_binary(db_file_, metadata::field_type::UNSIGNED_INT); byte_pos_ += io::write_binary(db_file_, std::string{"unique-terms"}); byte_pos_ += io::write_binary(db_file_, metadata::field_type::UNSIGNED_INT); + byte_pos_ += io::write_binary(db_file_, std::string{"path"}); + byte_pos_ += io::write_binary(db_file_, metadata::field_type::STRING); for (const auto& finfo : schema_) { @@ -33,6 +35,7 @@ metadata_writer::metadata_writer(const std::string& prefix, uint64_t num_docs, } void metadata_writer::write(doc_id d_id, uint64_t length, uint64_t num_unique, + const std::string& path, const std::vector& mdata) { std::lock_guard lock{lock_}; @@ -41,6 +44,7 @@ void metadata_writer::write(doc_id d_id, uint64_t length, uint64_t num_unique, // write "mandatory" metadata byte_pos_ += io::write_packed_binary(db_file_, length); byte_pos_ += io::write_packed_binary(db_file_, num_unique); + byte_pos_ += io::write_binary(db_file_, path); // write optional metadata if (mdata.size() != schema_.size()) From d9f99ffe9f0745d4b1da6f6f9010075472a4294c Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 14 Apr 2015 03:13:37 -0500 Subject: [PATCH 075/481] Add what identifier is unrecognized in factory exceptions (issue #80). 
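As an illustration (not part of this commit), a misspelled identifier now
produces a self-describing error; some_factory stands in for any
util::factory subclass and is assumed to expose the usual get() singleton
accessor:

    try
    {
        auto obj = some_factory::get().create("ngram-wrod" /*, args... */);
    }
    catch (const std::exception& e)
    {
        // before: unrecognized identifier
        // after:  unrecognized identifier: "ngram-wrod"
        std::cerr << e.what() << std::endl;
    }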
---
 include/util/factory.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/util/factory.h b/include/util/factory.h
index da1c6c320..127e960e3 100644
--- a/include/util/factory.h
+++ b/include/util/factory.h
@@ -12,6 +12,7 @@

 #include <functional>
 #include <memory>
+#include <string>
 #include <unordered_map>

 namespace meta
@@ -76,7 +77,7 @@ class factory
 pointer create(const std::string& identifier, Args&&... args)
 {
 if (methods_.find(identifier) == methods_.end())
- throw exception{"unrecognized identifier"};
+ throw exception{"unrecognized identifier: \"" + identifier + "\""};
 return methods_[identifier](std::forward<Args>(args)...);
 }

From b4e1e015fe32ed38f2a7f9806f33477c8ebb2d9b Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Tue, 14 Apr 2015 18:20:10 -0500
Subject: [PATCH 076/481] Optimize metadata for rankers.

---
 include/index/metadata.h | 26 +++++++-------
 src/index/ranker/ranker.cpp | 16 ++++++++--
 2 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/include/index/metadata.h b/include/index/metadata.h
index 977053a1b..5e5a7cf3d 100644
--- a/include/index/metadata.h
+++ b/include/index/metadata.h
@@ -70,43 +70,38 @@ class metadata
 template <class T>
 util::optional<T> get(const std::string& name)
 {
- for (uint64_t i = 0; i < stored_fields_.size(); ++i)
- {
- if (schema_[i].name == name)
- return {stored_fields_[i]};
- }
-
- for (uint64_t i = stored_fields_.size(); i < schema_.size(); ++i)
+ for (uint64_t i = 0; i < schema_.size(); ++i)
 {
 switch (schema_[i].type)
 {
 case field_type::SIGNED_INT:
 int64_t si;
 io::read_packed_binary(stream_, si);
- stored_fields_.emplace_back(si);
+ if (schema_[i].name == name)
+ return {field{si}};
 break;

 case field_type::UNSIGNED_INT:
 uint64_t ui;
 io::read_packed_binary(stream_, ui);
- stored_fields_.emplace_back(ui);
+ if (schema_[i].name == name)
+ return {field{ui}};
 break;

 case field_type::DOUBLE:
 double d;
 io::read_packed_binary(stream_, d);
- stored_fields_.emplace_back(d);
+ if (schema_[i].name == name)
+ return {field{d}};
 break;

 case field_type::STRING:
 std::string s{stream_.input_};
 stream_.input_ += s.size() + 1;
- stored_fields_.emplace_back(std::move(s));
+ if (schema_[i].name == name)
+ return {field{std::move(s)}};
 break;
 }
-
- if (schema_[i].name == name)
- return {stored_fields_[i]};
 }

 return util::nullopt;
diff --git a/src/index/ranker/ranker.cpp b/src/index/ranker/ranker.cpp
index 425117b4f..9eb5bdd17 100644
--- a/src/index/ranker/ranker.cpp
+++ b/src/index/ranker/ranker.cpp
@@ -3,6 +3,7 @@
 * @author Sean Massung
 */

+#include <unordered_map>
 #include "corpus/document.h"
 #include "index/inverted_index.h"
 #include "index/postings_data.h"
@@ -26,6 +27,10 @@ ranker::score(inverted_index& idx, corpus::document& query,
 idx.num_docs(),
 idx.total_corpus_terms(),
 query};

+ // a map from doc_id -> (length, unique_terms) to prevent looking up
+ // metadata repeatedly in the ranking loop
+ std::unordered_map<doc_id, std::pair<uint64_t, uint64_t>> md_map;
+
 // zeros out elements and (if necessary) resizes the vector; this eliminates
 // constructing a new vector each query for the same index
 results_.assign(sd.num_docs, std::numeric_limits::lowest());
@@ -42,8 +47,15 @@
 {
 sd.d_id = dpair.first;
 sd.doc_term_count = dpair.second;
- sd.doc_size = idx.doc_size(dpair.first);
- sd.doc_unique_terms = idx.unique_terms(dpair.first);
+
+ auto& md = md_map[dpair.first];
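// (Annotation, not part of this patch: md_map memoizes the
// (doc_size, unique_terms) pair per document, since each metadata get()
// now re-scans the packed record; the md.first == 0 test below treats a
// zero length as "not fetched yet".)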
if (md.first == 0) + { + md.first = idx.doc_size(dpair.first); + md.second = idx.unique_terms(dpair.first); + } + sd.doc_size = md.first; + sd.doc_unique_terms = md.second; // if this is the first time we've seen this document, compute // its initial score From 620f2c93c737ef1de20b1bf3d96827e07f9ee55c Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 14 Apr 2015 18:39:42 -0500 Subject: [PATCH 077/481] Fix incorrect metadata size being written. --- src/index/metadata_writer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index/metadata_writer.cpp b/src/index/metadata_writer.cpp index 2cd530c17..ead0b22bc 100644 --- a/src/index/metadata_writer.cpp +++ b/src/index/metadata_writer.cpp @@ -19,7 +19,7 @@ metadata_writer::metadata_writer(const std::string& prefix, uint64_t num_docs, schema_{schema} { // write metadata header - byte_pos_ += io::write_packed_binary(db_file_, schema_.size() + 2); + byte_pos_ += io::write_packed_binary(db_file_, schema_.size() + 3); byte_pos_ += io::write_binary(db_file_, std::string{"length"}); byte_pos_ += io::write_binary(db_file_, metadata::field_type::UNSIGNED_INT); byte_pos_ += io::write_binary(db_file_, std::string{"unique-terms"}); From 303cf4bb2806da4b26cabcbb65d5e2303c634cfc Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 14 Apr 2015 18:44:47 -0500 Subject: [PATCH 078/481] Don't require metadata.dat to exist for every corpus. --- include/index/metadata_parser.h | 3 +- src/index/metadata_parser.cpp | 54 ++++++++++++++++++--------------- 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/include/index/metadata_parser.h b/include/index/metadata_parser.h index 65a3dc707..4bb02b6ed 100644 --- a/include/index/metadata_parser.h +++ b/include/index/metadata_parser.h @@ -12,6 +12,7 @@ #include "index/metadata.h" #include "io/parser.h" +#include "util/optional.h" namespace meta { @@ -26,7 +27,7 @@ class metadata_parser std::vector next(); private: - io::parser parser_; + util::optional parser_; const metadata::schema& schema_; }; } diff --git a/src/index/metadata_parser.cpp b/src/index/metadata_parser.cpp index fc8e196ff..4609a2127 100644 --- a/src/index/metadata_parser.cpp +++ b/src/index/metadata_parser.cpp @@ -4,6 +4,7 @@ */ #include "index/metadata_parser.h" +#include "util/filesystem.h" namespace meta { @@ -12,38 +13,43 @@ namespace index metadata_parser::metadata_parser(const std::string& filename, const metadata::schema& schema) - : parser_{filename, "\n\t"}, schema_{schema} + : schema_{schema} { - // nothing + if (filesystem::file_exists(filename)) + parser_ = io::parser{filename, "\n\t"}; } std::vector metadata_parser::next() { std::vector mdata; - mdata.reserve(schema_.size()); - for (const auto& finfo : schema_) + if (parser_) { - if (!parser_.has_next()) - throw metadata::exception{"metadata input file ended prematurely"}; - auto str = parser_.next(); - - switch (finfo.type) + mdata.reserve(schema_.size()); + for (const auto& finfo : schema_) { - case metadata::field_type::SIGNED_INT: - mdata.emplace_back(static_cast(std::stol(str))); - break; - - case metadata::field_type::UNSIGNED_INT: - mdata.emplace_back(static_cast(std::stoul(str))); - break; - - case metadata::field_type::DOUBLE: - mdata.emplace_back(std::stod(str)); - break; - - case metadata::field_type::STRING: - mdata.emplace_back(std::move(str)); - break; + if (!parser_->has_next()) + throw metadata::exception{ + "metadata input file ended prematurely"}; + auto str = parser_->next(); + + switch (finfo.type) + { + case 
metadata::field_type::SIGNED_INT: + mdata.emplace_back(static_cast(std::stol(str))); + break; + + case metadata::field_type::UNSIGNED_INT: + mdata.emplace_back(static_cast(std::stoul(str))); + break; + + case metadata::field_type::DOUBLE: + mdata.emplace_back(std::stod(str)); + break; + + case metadata::field_type::STRING: + mdata.emplace_back(std::move(str)); + break; + } } } return mdata; From 67dd1bfa023824df50cebae2737674b3700bfab4 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 14 Apr 2015 23:03:43 -0500 Subject: [PATCH 079/481] Move metadata parsing to be a corpus responsibility. Restructure the document class to abstract out metadata and not always require a name/path. Paths are set automatically by file_corpus as additional metadata, and there's now a way for corpus classes to hook in and add their own metadata that is corpus-specific. --- include/analyzers/analyzer.h | 12 --- include/corpus/corpus.h | 18 +++++ include/corpus/document.h | 38 ++++----- include/corpus/file_corpus.h | 5 ++ include/corpus/gz_corpus.h | 3 - include/corpus/line_corpus.h | 3 - include/{index => corpus}/metadata.h | 85 ++++++++++++++++++++- include/{index => corpus}/metadata_parser.h | 15 ++-- include/index/metadata_file.h | 8 +- include/index/metadata_writer.h | 9 +-- include/util/filesystem.h | 1 - src/analyzers/analyzer.cpp | 20 +---- src/classify/classifier/knn.cpp | 2 +- src/corpus/CMakeLists.txt | 21 ++--- src/corpus/corpus.cpp | 34 +++++++-- src/corpus/document.cpp | 32 +++----- src/corpus/file_corpus.cpp | 25 +++++- src/corpus/gz_corpus.cpp | 10 +-- src/corpus/line_corpus.cpp | 15 +--- src/{index => corpus}/metadata.cpp | 6 +- src/{index => corpus}/metadata_parser.cpp | 13 +++- src/index/CMakeLists.txt | 2 - src/index/forward_index.cpp | 2 +- src/index/inverted_index.cpp | 26 ++----- src/index/metadata_file.cpp | 14 ++-- src/index/metadata_writer.cpp | 29 ++++--- src/index/tools/interactive-search.cpp | 2 +- src/index/tools/query-runner.cpp | 5 +- src/index/tools/search.cpp | 69 +++++++++-------- src/test/analyzer_test.cpp | 15 +--- src/test/ir_eval_test.cpp | 5 +- src/test/ranker_test.cpp | 5 +- 32 files changed, 311 insertions(+), 238 deletions(-) rename include/{index => corpus}/metadata.h (70%) rename include/{index => corpus}/metadata_parser.h (66%) rename src/{index => corpus}/metadata.cpp (94%) rename src/{index => corpus}/metadata_parser.cpp (85%) diff --git a/include/analyzers/analyzer.h b/include/analyzers/analyzer.h index d37ac4589..67c738ea8 100644 --- a/include/analyzers/analyzer.h +++ b/include/analyzers/analyzer.h @@ -89,18 +89,6 @@ class analyzer load_filter(std::unique_ptr src, const cpptoml::table& config); - /** - * @param doc The document to parse - * @param extension The possible file extension for this document if it - * is represented by a file on disk - * @param delims Possible character delimiters to use when parsing the - * file - * @return a parser suited to read data that this document represents - */ - static io::parser create_parser(const corpus::document& doc, - const std::string& extension, - const std::string& delims); - /** * @param doc The document to get content for * @return the contents of the document, as a string diff --git a/include/corpus/corpus.h b/include/corpus/corpus.h index 5bdb952ab..6ebf6160b 100644 --- a/include/corpus/corpus.h +++ b/include/corpus/corpus.h @@ -15,6 +15,8 @@ #include "meta.h" #include "corpus/document.h" +#include "corpus/metadata_parser.h" +#include "util/optional.h" namespace meta { @@ -48,6 +50,11 @@ class corpus */ 
virtual uint64_t size() const = 0; + /** + * @return the corpus' metadata schema + */ + virtual metadata::schema schema() const; + /** * Destructor. */ @@ -74,9 +81,20 @@ class corpus using std::runtime_error::runtime_error; }; + protected: + /** + * Helper function to be used by deriving classes in implementing + * next() to set the metadata for the current document. + */ + std::vector next_metadata(); + private: + void set_metadata_parser(metadata_parser&& mdparser); + /// The type of encoding this document uses std::string encoding_; + /// The metadata parser + util::optional mdata_parser_; }; } } diff --git a/include/corpus/document.h b/include/corpus/document.h index 72db7eb34..af758a2b3 100644 --- a/include/corpus/document.h +++ b/include/corpus/document.h @@ -12,8 +12,10 @@ #include #include +#include #include "meta.h" +#include "corpus/metadata.h" #include "util/optional.h" namespace meta @@ -33,11 +35,10 @@ class document public: /** * Constructor. - * @param path The path to the document * @param d_id The doc id to assign to this document * @param label The optional class label to assign this document */ - document(const std::string& path = "[NONE]", doc_id d_id = doc_id{0}, + document(doc_id d_id = doc_id{0}, const class_label& label = class_label{"[NONE]"}); /** @@ -47,26 +48,11 @@ class document */ void increment(const std::string& term, double amount); - /** - * @return the path to this document (the argument to the constructor) - */ - std::string path() const; - /** * @return the classification category this document is in */ const class_label& label() const; - /** - * @return the name of this document - */ - std::string name() const; - - /** - * @param n The new name for this document - */ - void name(const std::string& n); - /** * @return the total of transitions recorded for this document. * This is not the number of unique transitions. 
@@ -128,18 +114,26 @@ class document */ void label(class_label label); - private: - /// Where this document is on disk - std::string path_; + /** + * @return the set of extra metadata fields for this document + */ + const std::vector& metadata() const; + /** + * Sets the extra metadata fields for this document + * @param metadata The new metadata for this document + */ + void metadata(std::vector&& metadata); + + private: /// The document id for this document doc_id d_id_; /// Which category this document would be classified into class_label label_; - /// The short name for this document (not the full path) - std::string name_; + /// Other metadata fields for this document + std::vector mdata_; /// The number of (non-unique) tokens in this document size_t length_; diff --git a/include/corpus/file_corpus.h b/include/corpus/file_corpus.h index f998f1636..d88f4bf21 100644 --- a/include/corpus/file_corpus.h +++ b/include/corpus/file_corpus.h @@ -51,6 +51,11 @@ class file_corpus : public corpus */ uint64_t size() const override; + /** + * @return the metadata schema for this corpus + */ + metadata::schema schema() const override; + private: /// the current document we are on uint64_t cur_; diff --git a/include/corpus/gz_corpus.h b/include/corpus/gz_corpus.h index c17fbe608..82485bbe2 100644 --- a/include/corpus/gz_corpus.h +++ b/include/corpus/gz_corpus.h @@ -59,9 +59,6 @@ class gz_corpus : public corpus /// The stream to read the class labels io::gzifstream class_stream_; - - /// The stream to read the document names - io::gzifstream name_stream_; }; } } diff --git a/include/corpus/line_corpus.h b/include/corpus/line_corpus.h index 023ec36b9..cf8037a66 100644 --- a/include/corpus/line_corpus.h +++ b/include/corpus/line_corpus.h @@ -67,9 +67,6 @@ class line_corpus : public corpus /// Parser to read the class labels std::unique_ptr class_parser_; - - /// Parser to read the document names - std::unique_ptr name_parser_; }; } } diff --git a/include/index/metadata.h b/include/corpus/metadata.h similarity index 70% rename from include/index/metadata.h rename to include/corpus/metadata.h index 5e5a7cf3d..b11a586b7 100644 --- a/include/index/metadata.h +++ b/include/corpus/metadata.h @@ -7,8 +7,8 @@ * project. 
*/ -#ifndef META_INDEX_METADATA_H_ -#define META_INDEX_METADATA_H_ +#ifndef META_CORPUS_METADATA_H_ +#define META_CORPUS_METADATA_H_ #include #include @@ -20,7 +20,7 @@ namespace meta { -namespace index +namespace corpus { class metadata @@ -143,7 +143,7 @@ class metadata // nothing } - field(std::string&& s) : type{field_type::STRING} + field(std::string s) : type{field_type::STRING} { new (&str) std::string(std::move(s)); } @@ -170,6 +170,83 @@ class metadata } } + field(const field& other) : type{other.type} + { + switch (type) + { + case field_type::SIGNED_INT: + sign_int = other.sign_int; + break; + + case field_type::UNSIGNED_INT: + usign_int = other.usign_int; + break; + + case field_type::DOUBLE: + doub = other.doub; + break; + + case field_type::STRING: + new (&str) std::string(other.str); + break; + } + } + + field& operator=(field&& other) + { + if (type == field_type::STRING) + (&str)->~basic_string(); + + switch (other.type) + { + case field_type::SIGNED_INT: + sign_int = other.sign_int; + break; + + case field_type::UNSIGNED_INT: + usign_int = other.usign_int; + break; + + case field_type::DOUBLE: + doub = other.doub; + break; + + case field_type::STRING: + new (&str) std::string(std::move(other.str)); + break; + } + + type = other.type; + return *this; + } + + field& operator=(const field& other) + { + if (type == field_type::STRING) + (&str)->~basic_string(); + + switch (other.type) + { + case field_type::SIGNED_INT: + sign_int = other.sign_int; + break; + + case field_type::UNSIGNED_INT: + usign_int = other.usign_int; + break; + + case field_type::DOUBLE: + doub = other.doub; + break; + + case field_type::STRING: + new (&str) std::string(other.str); + break; + } + + return *this; + } + ~field() { // invoke string destructor if needed diff --git a/include/index/metadata_parser.h b/include/corpus/metadata_parser.h similarity index 66% rename from include/index/metadata_parser.h rename to include/corpus/metadata_parser.h index 4bb02b6ed..8bbc13b57 100644 --- a/include/index/metadata_parser.h +++ b/include/corpus/metadata_parser.h @@ -7,28 +7,31 @@ * project. */ -#ifndef META_INDEX_METADATA_PARSER_H_ -#define META_INDEX_METADATA_PARSER_H_ +#ifndef META_CORPUS_METADATA_PARSER_H_ +#define META_CORPUS_METADATA_PARSER_H_ -#include "index/metadata.h" +#include "corpus/metadata.h" #include "io/parser.h" #include "util/optional.h" namespace meta { -namespace index +namespace corpus { class metadata_parser { public: metadata_parser(const std::string& filename, - const metadata::schema& schema); + metadata::schema schema); std::vector next(); + + const metadata::schema& schema() const; + private: util::optional parser_; - const metadata::schema& schema_; + metadata::schema schema_; }; } } diff --git a/include/index/metadata_file.h b/include/index/metadata_file.h index 6a4f7f3c7..281e40889 100644 --- a/include/index/metadata_file.h +++ b/include/index/metadata_file.h @@ -11,7 +11,7 @@ #define META_INDEX_METADATA_FILE_H_ #include "util/disk_vector.h" -#include "index/metadata.h" +#include "corpus/metadata.h" #include "io/mmap_file.h" namespace meta @@ -29,7 +29,7 @@ namespace index * * - metadata.db: * - =>
^ - * -
=> ^( + 3) + * -
=> ^( + 2) * - => PackedInt * - => * - => String @@ -63,7 +63,7 @@ class metadata_file * @param d_id The document id to look up metadata for * @return the metadata for the document */ - metadata get(doc_id d_id) const; + corpus::metadata get(doc_id d_id) const; /** * @return the number of documents in this database @@ -72,7 +72,7 @@ class metadata_file private: /// the schema for this file - metadata::schema schema_; + corpus::metadata::schema schema_; /// the seek positions for every document in this file util::disk_vector index_; diff --git a/include/index/metadata_writer.h b/include/index/metadata_writer.h index 5c481e2ad..038ffc794 100644 --- a/include/index/metadata_writer.h +++ b/include/index/metadata_writer.h @@ -12,7 +12,7 @@ #include #include "corpus/document.h" -#include "index/metadata.h" +#include "corpus/metadata.h" #include "util/disk_vector.h" namespace meta @@ -24,18 +24,17 @@ class metadata_writer { public: metadata_writer(const std::string& prefix, uint64_t num_docs, - const metadata::schema& schema); + corpus::metadata::schema schema); void write(doc_id d_id, uint64_t length, uint64_t num_unique, - const std::string& path, - const std::vector& mdata); + const std::vector& mdata); private: std::mutex lock_; util::disk_vector seek_pos_; uint64_t byte_pos_; std::ofstream db_file_; - const metadata::schema& schema_; + corpus::metadata::schema schema_; }; } } diff --git a/include/util/filesystem.h b/include/util/filesystem.h index 312d53af6..f99129c80 100644 --- a/include/util/filesystem.h +++ b/include/util/filesystem.h @@ -169,5 +169,4 @@ inline uint64_t num_lines(const std::string& filename, char delimiter = '\n') } } } - #endif diff --git a/src/analyzers/analyzer.cpp b/src/analyzers/analyzer.cpp index c40626912..848d1c7a9 100644 --- a/src/analyzers/analyzer.cpp +++ b/src/analyzers/analyzer.cpp @@ -26,23 +26,11 @@ namespace analyzers std::string analyzer::get_content(const corpus::document& doc) { - if (doc.contains_content()) - return utf::to_utf8(doc.content(), doc.encoding()); + if (!doc.contains_content()) + throw analyzer_exception{ + "document content was not populated for analysis"}; - io::mmap_file file{doc.path()}; - return utf::to_utf8({file.begin(), file.size()}, doc.encoding()); -} - -io::parser analyzer::create_parser(const corpus::document& doc, - const std::string& extension, - const std::string& delims) -{ - if (doc.contains_content()) - return io::parser{doc.content(), delims, - io::parser::input_type::String}; - else - return io::parser{doc.path() + extension, delims, - io::parser::input_type::File}; + return utf::to_utf8(doc.content(), doc.encoding()); } std::unique_ptr diff --git a/src/classify/classifier/knn.cpp b/src/classify/classifier/knn.cpp index 231ca648f..6553e3de5 100644 --- a/src/classify/classifier/knn.cpp +++ b/src/classify/classifier/knn.cpp @@ -42,7 +42,7 @@ class_label knn::classify(doc_id d_id) "k must be smaller than the " "number of documents in the index (training documents)"}; - corpus::document query{"[no path]", d_id}; + corpus::document query{d_id}; for (const auto& count : idx_->search_primary(d_id)->counts()) query.increment(idx_->term_text(count.first), count.second); diff --git a/src/corpus/CMakeLists.txt b/src/corpus/CMakeLists.txt index e539e14ca..add04d8ff 100644 --- a/src/corpus/CMakeLists.txt +++ b/src/corpus/CMakeLists.txt @@ -2,17 +2,18 @@ project(meta-corpus) add_subdirectory(tools) +set(CORPUS_SOURCES corpus.cpp + document.cpp + file_corpus.cpp + line_corpus.cpp + metadata.cpp + metadata_parser.cpp) + if (ZLIB_FOUND) 
- add_library(meta-corpus corpus.cpp - document.cpp - file_corpus.cpp - line_corpus.cpp - gz_corpus.cpp) -else() - add_library(meta-corpus corpus.cpp - document.cpp - file_corpus.cpp - line_corpus.cpp) + list(APPEND CORPUS_SOURCES gz_corpus.cpp) endif() + +add_library(meta-corpus ${CORPUS_SOURCES}) + # some corpus classes use io::parser target_link_libraries(meta-corpus meta-io) diff --git a/src/corpus/corpus.cpp b/src/corpus/corpus.cpp index 33f32708e..e9fce1ca6 100644 --- a/src/corpus/corpus.cpp +++ b/src/corpus/corpus.cpp @@ -19,11 +19,26 @@ corpus::corpus(std::string encoding) : encoding_{std::move(encoding)} // nothing } +std::vector corpus::next_metadata() +{ + return mdata_parser_->next(); +} + +metadata::schema corpus::schema() const +{ + return mdata_parser_->schema(); +} + const std::string& corpus::encoding() const { return encoding_; } +void corpus::set_metadata_parser(metadata_parser&& parser) +{ + mdata_parser_ = std::move(parser); +} + std::unique_ptr corpus::load(const std::string& config_file) { auto config = cpptoml::parse_file(config_file); @@ -57,6 +72,8 @@ std::unique_ptr corpus::load(const std::string& config_file) else encoding = "utf-8"; + std::unique_ptr result; + if (*type == "file-corpus") { auto file_list = corpus_config.get_as("list"); @@ -66,8 +83,8 @@ std::unique_ptr corpus::load(const std::string& config_file) std::string file = *prefix + "/" + *dataset + "/" + *file_list + "-full-corpus.txt"; - return make_unique(*prefix + "/" + *dataset + "/", file, - encoding); + result = make_unique(*prefix + "/" + *dataset + "/", file, + encoding); } else if (*type == "line-corpus") { @@ -75,20 +92,25 @@ std::unique_ptr corpus::load(const std::string& config_file) + ".dat"; auto lines = corpus_config.get_as("num-lines"); if (!lines) - return make_unique(filename, encoding); - return make_unique(filename, encoding, - static_cast(*lines)); + result = make_unique(filename, encoding); + else + result = make_unique(filename, encoding, + static_cast(*lines)); } #if META_HAS_ZLIB else if (*type == "gz-corpus") { std::string filename = *prefix + "/" + *dataset + "/" + *dataset + ".dat"; - return make_unique(filename, encoding); + result = make_unique(filename, encoding); } #endif else throw corpus_exception{"corpus type was not able to be determined"}; + + result->set_metadata_parser({*prefix + "/" + *dataset + "/metadata.dat", + metadata_schema(corpus_config)}); + return result; } } } diff --git a/src/corpus/document.cpp b/src/corpus/document.cpp index 34432cca7..19accca67 100644 --- a/src/corpus/document.cpp +++ b/src/corpus/document.cpp @@ -12,12 +12,9 @@ namespace meta namespace corpus { -document::document(const std::string& path, doc_id d_id, - const class_label& label) - : path_{path}, d_id_{d_id}, label_{label}, length_{0}, encoding_{"utf-8"} +document::document(doc_id d_id, const class_label& label) + : d_id_{d_id}, label_{label}, length_{0}, encoding_{"utf-8"} { - size_t idx = path.find_last_of("/") + 1; - name_ = path.substr(idx); } void document::increment(const std::string& term, double amount) @@ -26,26 +23,11 @@ void document::increment(const std::string& term, double amount) length_ += amount; } -std::string document::path() const -{ - return path_; -} - const class_label& document::label() const { return label_; } -std::string document::name() const -{ - return name_; -} - -void document::name(const std::string& n) -{ - name_ = n; -} - uint64_t document::length() const { return length_; @@ -100,5 +82,15 @@ void document::label(class_label label) { label_ = 
label; } + +const std::vector& document::metadata() const +{ + return mdata_; +} + +void document::metadata(std::vector&& metadata) +{ + mdata_ = std::move(metadata); +} } } diff --git a/src/corpus/file_corpus.cpp b/src/corpus/file_corpus.cpp index 112faed35..c52b0ae69 100644 --- a/src/corpus/file_corpus.cpp +++ b/src/corpus/file_corpus.cpp @@ -5,6 +5,8 @@ #include "corpus/file_corpus.h" #include "io/parser.h" +#include "util/filesystem.h" +#include "utf/utf.h" namespace meta { @@ -43,8 +45,19 @@ bool file_corpus::has_next() const document file_corpus::next() { - document doc{prefix_ + docs_[cur_].first, doc_id{cur_}, docs_[cur_].second}; - doc.encoding(encoding()); + document doc{doc_id{cur_}, docs_[cur_].second}; + + if (!filesystem::file_exists(prefix_ + docs_[cur_].first)) + throw corpus_exception{"file \"" + docs_[cur_].first + + "\" does not exist"}; + + doc.content(filesystem::file_text(prefix_ + docs_[cur_].first), encoding()); + + auto mdata = next_metadata(); + // add "path" metadata manually + mdata.insert(mdata.begin(), metadata::field{prefix_ + docs_[cur_].first}); + doc.metadata(std::move(mdata)); + ++cur_; return doc; } @@ -53,5 +66,13 @@ uint64_t file_corpus::size() const { return docs_.size(); } + +metadata::schema file_corpus::schema() const +{ + auto schema = corpus::schema(); + schema.insert(schema.begin(), + metadata::field_info{"path", metadata::field_type::STRING}); + return schema; +} } } diff --git a/src/corpus/gz_corpus.cpp b/src/corpus/gz_corpus.cpp index e619e3bb4..0779394cb 100644 --- a/src/corpus/gz_corpus.cpp +++ b/src/corpus/gz_corpus.cpp @@ -15,8 +15,7 @@ gz_corpus::gz_corpus(const std::string& file, std::string encoding) : corpus{std::move(encoding)}, cur_id_{0}, corpus_stream_{file + ".gz"}, - class_stream_{file + ".labels.gz"}, - name_stream_{file + ".names.gz"} + class_stream_{file + ".labels.gz"} { if (!filesystem::file_exists(file + ".numdocs")) throw corpus::corpus_exception{ @@ -41,19 +40,16 @@ bool gz_corpus::has_next() const document gz_corpus::next() { class_label label{"[none]"}; - std::string name{"[none]"}; if (class_stream_) std::getline(class_stream_, static_cast(label)); - if (name_stream_) - std::getline(name_stream_, name); - std::string line; std::getline(corpus_stream_, line); - document doc{name, cur_id_++, label}; + document doc{cur_id_++, label}; doc.content(line, encoding()); + doc.metadata(next_metadata()); return doc; } diff --git a/src/corpus/line_corpus.cpp b/src/corpus/line_corpus.cpp index 98f54ddf0..b292b549b 100644 --- a/src/corpus/line_corpus.cpp +++ b/src/corpus/line_corpus.cpp @@ -30,14 +30,6 @@ line_corpus::line_corpus(const std::string& file, std::string encoding, num_lines_ = filesystem::num_lines(file + ".labels"); } - // init class label info - if (filesystem::file_exists(file + ".names")) - { - name_parser_ = make_unique(file + ".names", "\n"); - if (num_lines_ == 0) - num_lines_ = filesystem::num_lines(file + ".names"); - } - // if we couldn't determine the number of lines in the constructor and the // two optional files don't exist, we have to count newlines here if (num_lines_ == 0) @@ -52,16 +44,13 @@ bool line_corpus::has_next() const document line_corpus::next() { class_label label{"[none]"}; - std::string name{"[none]"}; if (class_parser_) label = class_label{class_parser_->next()}; - if (name_parser_) - name = name_parser_->next(); - - document doc{name, cur_id_++, label}; + document doc{cur_id_++, label}; doc.content(parser_.next(), encoding()); + doc.metadata(next_metadata()); return doc; } diff --git 
a/src/index/metadata.cpp b/src/corpus/metadata.cpp similarity index 94% rename from src/index/metadata.cpp rename to src/corpus/metadata.cpp index e7896b753..86557385c 100644 --- a/src/index/metadata.cpp +++ b/src/corpus/metadata.cpp @@ -3,11 +3,11 @@ * @author Chase Geigle */ -#include "index/metadata.h" +#include "corpus/metadata.h" namespace meta { -namespace index +namespace corpus { metadata::schema metadata_schema(const cpptoml::table& config) @@ -28,7 +28,7 @@ metadata::schema metadata_schema(const cpptoml::table& config) if (!type) throw metadata::exception{"type needed for metadata field"}; - index::metadata::field_type ftype; + metadata::field_type ftype; if (*type == "int") { ftype = metadata::field_type::SIGNED_INT; diff --git a/src/index/metadata_parser.cpp b/src/corpus/metadata_parser.cpp similarity index 85% rename from src/index/metadata_parser.cpp rename to src/corpus/metadata_parser.cpp index 4609a2127..f06907de7 100644 --- a/src/index/metadata_parser.cpp +++ b/src/corpus/metadata_parser.cpp @@ -3,17 +3,17 @@ * @author Chase Geigle */ -#include "index/metadata_parser.h" +#include "corpus/metadata_parser.h" #include "util/filesystem.h" namespace meta { -namespace index +namespace corpus { metadata_parser::metadata_parser(const std::string& filename, - const metadata::schema& schema) - : schema_{schema} + metadata::schema schema) + : schema_{std::move(schema)} { if (filesystem::file_exists(filename)) parser_ = io::parser{filename, "\n\t"}; @@ -54,5 +54,10 @@ std::vector metadata_parser::next() } return mdata; } + +const metadata::schema& metadata_parser::schema() const +{ + return schema_; +} } } diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt index a6532b118..657a4405a 100644 --- a/src/index/CMakeLists.txt +++ b/src/index/CMakeLists.txt @@ -7,9 +7,7 @@ add_subdirectory(tools) add_library(meta-index disk_index.cpp forward_index.cpp inverted_index.cpp - metadata.cpp metadata_file.cpp - metadata_parser.cpp metadata_writer.cpp string_list.cpp string_list_writer.cpp diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index 3ebac3f8b..3ab5bf9e0 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -255,7 +255,7 @@ void forward_index::impl::create_libsvm_postings(const cpptoml::table& config) pdata.set_counts(counts); out.write(pdata); - md_writer.write(d_id, static_cast(length), num_unique, "", + md_writer.write(d_id, static_cast(length), num_unique, {}); ++d_id; } diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp index c10b13251..028aa0b47 100644 --- a/src/index/inverted_index.cpp +++ b/src/index/inverted_index.cpp @@ -8,7 +8,7 @@ #include "index/chunk_handler.h" #include "index/disk_index_impl.h" #include "index/inverted_index.h" -#include "index/metadata_parser.h" +#include "corpus/metadata_parser.h" #include "index/metadata_writer.h" #include "index/postings_file.h" #include "index/postings_file_writer.h" @@ -54,7 +54,6 @@ class inverted_index::impl */ void tokenize_docs(corpus::corpus* docs, chunk_handler& handler, - metadata_parser& mdata_parser, metadata_writer& mdata_writer); /** @@ -121,26 +120,14 @@ void inverted_index::create_index(const std::string& config_file) // load the documents from the corpus auto docs = corpus::corpus::load(config_file); - // load the metadata schema for the corpus - auto config = cpptoml::parse_file(config_file); - auto prefix = config.get_as("prefix"); - auto dataset = config.get_as("dataset"); - auto corpus = config.get_as("corpus"); - auto corpus_config - = 
cpptoml::parse_file(*prefix + "/" + *dataset + "/" + *corpus); - auto schema = metadata_schema(corpus_config); - chunk_handler handler{index_name()}; - { - metadata_parser mdata_parser{*prefix + "/" + *dataset + "/metadata.dat", - schema}; - metadata_writer mdata_writer{index_name(), docs->size(), schema}; + metadata_writer mdata_writer{index_name(), docs->size(), + docs->schema()}; uint64_t num_docs = docs->size(); impl_->load_labels(num_docs); - inv_impl_->tokenize_docs(docs.get(), handler, mdata_parser, - mdata_writer); + inv_impl_->tokenize_docs(docs.get(), handler, mdata_writer); } handler.merge_chunks(); @@ -176,7 +163,6 @@ void inverted_index::load_index() void inverted_index::impl::tokenize_docs(corpus::corpus* docs, chunk_handler& handler, - metadata_parser& mdata_parser, metadata_writer& mdata_writer) { std::mutex mutex; @@ -189,7 +175,6 @@ void inverted_index::impl::tokenize_docs(corpus::corpus* docs, while (true) { util::optional doc; - util::optional> mdata; { std::lock_guard lock{mutex}; @@ -197,7 +182,6 @@ void inverted_index::impl::tokenize_docs(corpus::corpus* docs, return; // destructor for producer will write // any intermediate chunks doc = docs->next(); - mdata = mdata_parser.next(); progress(doc->id()); } @@ -213,7 +197,7 @@ void inverted_index::impl::tokenize_docs(corpus::corpus* docs, } mdata_writer.write(doc->id(), doc->length(), doc->counts().size(), - doc->path(), *mdata); + doc->metadata()); idx_->impl_->set_label(doc->id(), doc->label()); // update chunk diff --git a/src/index/metadata_file.cpp b/src/index/metadata_file.cpp index 9dc1a97ef..f7f5858dc 100644 --- a/src/index/metadata_file.cpp +++ b/src/index/metadata_file.cpp @@ -24,7 +24,8 @@ struct char_input_stream char get() { if (input_ == end_) - throw metadata::exception{"seeking past end of metadata file"}; + throw corpus::metadata::exception{ + "seeking past end of metadata file"}; return *input_++; } @@ -45,20 +46,21 @@ metadata_file::metadata_file(const std::string& prefix) schema_.reserve(num_fields); for (uint64_t i = 0; i < num_fields; ++i) { - metadata::field_info info; + corpus::metadata::field_info info; info.name = std::string{stream.input_}; stream.input_ += info.name.size() + 1; - static_assert(sizeof(metadata::field_type) == sizeof(uint8_t), + static_assert(sizeof(corpus::metadata::field_type) == sizeof(uint8_t), "metadata::field_type size not updated in metadata_file"); - info.type = static_cast(stream.get()); + info.type = static_cast(stream.get()); schema_.emplace_back(std::move(info)); } } -metadata metadata_file::get(doc_id d_id) const +corpus::metadata metadata_file::get(doc_id d_id) const { if (d_id >= index_.size()) - throw metadata::exception{"invalid doc id in metadata retrieval"}; + throw corpus::metadata::exception{ + "invalid doc id in metadata retrieval"}; uint64_t seek_pos = index_[d_id]; return {md_db_.begin() + seek_pos, schema_}; diff --git a/src/index/metadata_writer.cpp b/src/index/metadata_writer.cpp index ead0b22bc..af4d86475 100644 --- a/src/index/metadata_writer.cpp +++ b/src/index/metadata_writer.cpp @@ -12,20 +12,20 @@ namespace index { metadata_writer::metadata_writer(const std::string& prefix, uint64_t num_docs, - const metadata::schema& schema) + corpus::metadata::schema schema) : seek_pos_{prefix + "/metadata.index", num_docs}, byte_pos_{0}, db_file_{prefix + "/metadata.db", std::ios::binary}, - schema_{schema} + schema_{std::move(schema)} { // write metadata header - byte_pos_ += io::write_packed_binary(db_file_, schema_.size() + 3); + byte_pos_ += 
io::write_packed_binary(db_file_, schema_.size() + 2); byte_pos_ += io::write_binary(db_file_, std::string{"length"}); - byte_pos_ += io::write_binary(db_file_, metadata::field_type::UNSIGNED_INT); + byte_pos_ += io::write_binary(db_file_, + corpus::metadata::field_type::UNSIGNED_INT); byte_pos_ += io::write_binary(db_file_, std::string{"unique-terms"}); - byte_pos_ += io::write_binary(db_file_, metadata::field_type::UNSIGNED_INT); - byte_pos_ += io::write_binary(db_file_, std::string{"path"}); - byte_pos_ += io::write_binary(db_file_, metadata::field_type::STRING); + byte_pos_ += io::write_binary(db_file_, + corpus::metadata::field_type::UNSIGNED_INT); for (const auto& finfo : schema_) { @@ -35,8 +35,7 @@ metadata_writer::metadata_writer(const std::string& prefix, uint64_t num_docs, } void metadata_writer::write(doc_id d_id, uint64_t length, uint64_t num_unique, - const std::string& path, - const std::vector& mdata) + const std::vector& mdata) { std::lock_guard lock{lock_}; @@ -44,29 +43,29 @@ void metadata_writer::write(doc_id d_id, uint64_t length, uint64_t num_unique, // write "mandatory" metadata byte_pos_ += io::write_packed_binary(db_file_, length); byte_pos_ += io::write_packed_binary(db_file_, num_unique); - byte_pos_ += io::write_binary(db_file_, path); // write optional metadata if (mdata.size() != schema_.size()) - throw metadata::exception{"schema mismatch when writing metadata"}; + throw corpus::metadata::exception{ + "schema mismatch when writing metadata"}; for (const auto& fld : mdata) { switch (fld.type) { - case metadata::field_type::SIGNED_INT: + case corpus::metadata::field_type::SIGNED_INT: byte_pos_ += io::write_packed_binary(db_file_, fld.sign_int); break; - case metadata::field_type::UNSIGNED_INT: + case corpus::metadata::field_type::UNSIGNED_INT: byte_pos_ += io::write_packed_binary(db_file_, fld.usign_int); break; - case metadata::field_type::DOUBLE: + case corpus::metadata::field_type::DOUBLE: byte_pos_ += io::write_packed_binary(db_file_, fld.doub); break; - case metadata::field_type::STRING: + case corpus::metadata::field_type::STRING: byte_pos_ += io::write_binary(db_file_, fld.str); break; } diff --git a/src/index/tools/interactive-search.cpp b/src/index/tools/interactive-search.cpp index 698d725ea..764cfa0cb 100644 --- a/src/index/tools/interactive-search.cpp +++ b/src/index/tools/interactive-search.cpp @@ -77,7 +77,7 @@ int main(int argc, char* argv[]) if (text.empty()) break; - corpus::document query{"[user input]", doc_id{0}}; + corpus::document query{doc_id{0}}; query.content(text); // set the doc's content to be user input // Use the ranker to score the query over the index. diff --git a/src/index/tools/query-runner.cpp b/src/index/tools/query-runner.cpp index 2b36b5961..8fa0f892c 100644 --- a/src/index/tools/query-runner.cpp +++ b/src/index/tools/query-runner.cpp @@ -62,10 +62,9 @@ int main(int argc, char* argv[]) while (queries.good() && i <= 500) // only look at first 500 queries { std::getline(queries, content); - corpus::document query{"[user input]", doc_id{0}}; + corpus::document query{doc_id{0}}; query.content(content); - std::cout << "Ranking query " << i++ << ": " << query.path() - << std::endl; + std::cout << "Ranking query " << i++ << ": " << std::endl; // Use the ranker to score the query over the index. 
By default, the // ranker returns 10 documents, so we will display the "top 10 of diff --git a/src/index/tools/search.cpp b/src/index/tools/search.cpp index d1b12a958..39da973da 100644 --- a/src/index/tools/search.cpp +++ b/src/index/tools/search.cpp @@ -59,41 +59,46 @@ int main(int argc, char* argv[]) // Time how long it takes to create the index. By default, common::time's // unit of measurement is milliseconds. - auto elapsed = common::time([&]() - { - // Get a std::vector of doc_ids that have been indexed. - auto docs = idx->docs(); - - // Search for up to the first 20 documents; we hope that the first - // result is the original document itself since we're querying with - // documents that are already indexed. - for (size_t i = 0; i < 20 && i < idx->num_docs(); ++i) + auto elapsed = common::time( + [&]() { - // Create a document and specify its path; its content will be - // filled by the analyzer. - corpus::document query{idx->doc_path(docs[i]), doc_id{docs[i]}}; - query.encoding(encoding); - - std::cout << "Ranking query " << (i + 1) << ": " << query.path() - << std::endl; - - // Use the ranker to score the query over the index. By default, the - // ranker returns 10 documents, so we will display the "top 10 of - // 10" docs. - auto ranking = ranker->score(*idx, query); - std::cout << "Showing top 10 of " << ranking.size() << " results." - << std::endl; - - // Print out the top ten results. - for (size_t i = 0; i < ranking.size() && i < 10; ++i) + // Get a std::vector of doc_ids that have been indexed. + auto docs = idx->docs(); + + // Search for up to the first 20 documents; we hope that the first + // result is the original document itself since we're querying with + // documents that are already indexed. + for (size_t i = 0; i < 20 && i < idx->num_docs(); ++i) { - std::cout << (i + 1) << ". " << idx->doc_name(ranking[i].first) - << " " << ranking[i].second << std::endl; + auto path = idx->doc_path(docs[i]); + // Create a document and specify its path; its content will be + // filled by the analyzer. + corpus::document query{doc_id{docs[i]}}; + query.content(filesystem::file_text(path), encoding); + + std::cout << "Ranking query " << (i + 1) << ": " << path + << std::endl; + + // Use the ranker to score the query over the index. By default, + // the + // ranker returns 10 documents, so we will display the "top 10 + // of + // 10" docs. + auto ranking = ranker->score(*idx, query); + std::cout << "Showing top 10 of " << ranking.size() + << " results." << std::endl; + + // Print out the top ten results. + for (size_t i = 0; i < ranking.size() && i < 10; ++i) + { + std::cout << (i + 1) << ". 
" + << idx->doc_name(ranking[i].first) << " " + << ranking[i].second << std::endl; + } + + std::cout << std::endl; } - - std::cout << std::endl; - } - }); + }); std::cout << "Elapsed time: " << elapsed.count() / 1000.0 << " seconds" << std::endl; diff --git a/src/test/analyzer_test.cpp b/src/test/analyzer_test.cpp index ea7c593cc..ee75d6b5e 100644 --- a/src/test/analyzer_test.cpp +++ b/src/test/analyzer_test.cpp @@ -33,21 +33,11 @@ void check_analyzer_expected(Analyzer& ana, corpus::document doc, ASSERT_EQUAL(doc.counts().size(), num_unique); ASSERT_EQUAL(doc.length(), length); ASSERT_EQUAL(doc.id(), 47ul); - if (doc.contains_content()) - { - ASSERT_EQUAL(doc.path(), "/home/person/filename.txt"); - ASSERT_EQUAL(doc.name(), "filename.txt"); - } - else - { - ASSERT_EQUAL(doc.path(), "../data/sample-document.txt"); - ASSERT_EQUAL(doc.name(), "sample-document.txt"); - } } int content_tokenize() { - corpus::document doc{"/home/person/filename.txt", doc_id{47}}; + corpus::document doc{doc_id{47}}; // "one" is a stopword std::string content = "one one two two two three four one five"; @@ -78,7 +68,8 @@ int content_tokenize() int file_tokenize() { int num_failed = 0; - corpus::document doc{"../data/sample-document.txt", doc_id{47}}; + corpus::document doc{doc_id{47}}; + doc.content(filesystem::file_text("../data/sample-document.txt")); num_failed += testing::run_test("file-unigram-word-analyzer", [&]() { diff --git a/src/test/ir_eval_test.cpp b/src/test/ir_eval_test.cpp index d6eed71a7..c04b674f8 100644 --- a/src/test/ir_eval_test.cpp +++ b/src/test/ir_eval_test.cpp @@ -44,7 +44,10 @@ int ir_eval_bounds() // sanity test bounds for (size_t i = 0; i < 5; ++i) { - corpus::document query{idx->doc_path(doc_id{i}), doc_id{0}}; + auto path = idx->doc_path(doc_id{i}); + corpus::document query{doc_id{0}}; + query.content(filesystem::file_text(path)); + auto ranking = ranker.score(*idx, query); auto f1 = eval.f1(ranking, query_id{i}); auto p = eval.precision(ranking, query_id{i}); diff --git a/src/test/ranker_test.cpp b/src/test/ranker_test.cpp index e1be3a673..0c6ff9791 100644 --- a/src/test/ranker_test.cpp +++ b/src/test/ranker_test.cpp @@ -17,8 +17,9 @@ void test_rank(Ranker& r, Index& idx, const std::string& encoding) for (size_t i = 0; i < idx.num_docs(); ++i) { auto d_id = idx.docs()[i]; - corpus::document query{idx.doc_path(d_id), doc_id{i}}; - query.encoding(encoding); + auto path = idx.doc_path(d_id); + corpus::document query{doc_id{i}}; + query.content(filesystem::file_text(path), encoding); auto ranking = r.score(idx, query); ASSERT_EQUAL(ranking.size(), 10ul); // default is 10 docs From 8fed58378c8ecc43ed33d76807744822a6d27c42 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 14 Apr 2015 23:31:42 -0500 Subject: [PATCH 080/481] Add doxygen comments to metadata_parser. --- include/corpus/metadata_parser.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/include/corpus/metadata_parser.h b/include/corpus/metadata_parser.h index 8bbc13b57..51f9a6c9a 100644 --- a/include/corpus/metadata_parser.h +++ b/include/corpus/metadata_parser.h @@ -19,18 +19,35 @@ namespace meta namespace corpus { +/** + * Reads metadata from the metadata file of a corpus according to a schema. + */ class metadata_parser { public: + /** + * Creates the parser. 
+ * @param filename The name of the file to parse + * @param schema The schema to parse the file with + */ metadata_parser(const std::string& filename, metadata::schema schema); + /** + * @return the metadata vector for the next document in the file + */ std::vector next(); + /** + * @return the schema for the metadata in this file + */ const metadata::schema& schema() const; private: + /// the parser used to extract metadata util::optional parser_; + + /// the schema for the metadata being extracted metadata::schema schema_; }; } From 5ad659126bff7a3ba28fc09594dfcb9d03bdaa1c Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 14 Apr 2015 23:31:55 -0500 Subject: [PATCH 081/481] Add doxygen comments to metadata_writer. --- include/index/metadata_writer.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/include/index/metadata_writer.h b/include/index/metadata_writer.h index 038ffc794..5f1c767bf 100644 --- a/include/index/metadata_writer.h +++ b/include/index/metadata_writer.h @@ -20,20 +20,45 @@ namespace meta namespace index { +/** + * Writes document metadata into the packed format for the index. + */ class metadata_writer { public: + /** + * Constructs the writer. + * @param prefix The directory to place the metadata database and index + * @param num_docs The number of documents we have metadata for + * @param schema The schema for the metadata we will store + */ metadata_writer(const std::string& prefix, uint64_t num_docs, corpus::metadata::schema schema); + /** + * Writes a document's metadata to the database and index. + * @param d_id The document id + * @param length The length of the document + * @param num_unique The number of unique terms in the document + * @param mdata Any additional metadata to be written + */ void write(doc_id d_id, uint64_t length, uint64_t num_unique, const std::vector& mdata); private: + /// a lock for thread safety std::mutex lock_; + + /// the index into the database file util::disk_vector seek_pos_; + + /// the current byte position in the database uint64_t byte_pos_; + + /// the output stream for the database file std::ofstream db_file_; + + /// the schema of the metadata we are writing corpus::metadata::schema schema_; }; } From 47118415f6424bddd7ca003bfdca0be7c04bf758 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 14 Apr 2015 23:33:23 -0500 Subject: [PATCH 082/481] Minor documentation improvements for metadata. --- include/corpus/metadata.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/corpus/metadata.h b/include/corpus/metadata.h index b11a586b7..56fba243c 100644 --- a/include/corpus/metadata.h +++ b/include/corpus/metadata.h @@ -23,6 +23,9 @@ namespace meta namespace corpus { +/** + * Represents the collection of metadata for a document. + */ class metadata { public: @@ -67,6 +70,11 @@ class metadata // nothing } + /** + * @param name The metadata field to obtain + * @return the metadata associated with that field, if it exists, + * converted to type T. + */ template util::optional get(const std::string& name) { From 0f424e15df075138d31d0ebc159e24b45b9c1bc5 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 15 Apr 2015 16:58:17 -0500 Subject: [PATCH 083/481] Add braces around case statement that initialized local variables. (I'm honestly not sure why this wasn't a compiler error.) 
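A note on the "why": in C++ a switch may not jump past the declaration of a
variable with non-vacuous initialization, and the lone non-vacuous case here
(the std::string) sat under the final case label, so no jump ever crossed its
initialization; that is most likely why the unbraced version compiled. A
minimal sketch of the rule the braces guard against (hypothetical code, not
from this repository):

    #include <string>

    void example(int kind)
    {
        switch (kind)
        {
            case 0:
            {
                // non-vacuous initialization: the braces end s's scope
                // before the next case label
                std::string s{"zero"};
                (void)s; // silence unused-variable warnings
                break;
            }
            case 1:
                // without the braces above, this label would jump into
                // s's scope, making the program ill-formed
                break;
        }
    }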
--- include/corpus/metadata.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/corpus/metadata.h b/include/corpus/metadata.h index 56fba243c..345d9b198 100644 --- a/include/corpus/metadata.h +++ b/include/corpus/metadata.h @@ -83,32 +83,40 @@ class metadata switch (schema_[i].type) { case field_type::SIGNED_INT: + { int64_t si; io::read_packed_binary(stream_, si); if (schema_[i].name == name) return {field{si}}; break; + } case field_type::UNSIGNED_INT: + { uint64_t ui; io::read_packed_binary(stream_, ui); if (schema_[i].name == name) return {field{ui}}; break; + } case field_type::DOUBLE: + { double d; io::read_packed_binary(stream_, d); if (schema_[i].name == name) return {field{d}}; break; + } case field_type::STRING: + { std::string s{stream_.input_}; stream_.input_ += s.size() + 1; if (schema_[i].name == name) return {field{std::move(s)}}; break; + } } } From bde2abc4bfcb3b414f670341ab283f41ab1e56ad Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Wed, 15 Apr 2015 17:06:48 -0500 Subject: [PATCH 084/481] update SHA256 hash for CEEAUS testing files --- src/test/tools/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/tools/CMakeLists.txt b/src/test/tools/CMakeLists.txt index 90ef4e115..6a0ae6698 100644 --- a/src/test/tools/CMakeLists.txt +++ b/src/test/tools/CMakeLists.txt @@ -2,7 +2,7 @@ ExternalProject_Add(ceeaus SOURCE_DIR ${meta_BINARY_DIR}/../../data/ceeaus DOWNLOAD_DIR ${meta_BINARY_DIR}/downloads URL http://web.engr.illinois.edu/~massung1/files/ceeaus.tar.gz - URL_HASH "SHA256=dbcdecc4098bd02dd31c35930fad9ae81a85dc07ac79f734a127fe915a52ca25" + URL_HASH "SHA256=8ea40b32f34e9ae8aedffe562ad468fc465d1cc0ff6a5c3bdf0ee42bb85c231e" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "") From 765596fcbc56b6af1a2f75f94f096c015abecbc4 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 15 Apr 2015 17:41:36 -0500 Subject: [PATCH 085/481] Change document::metadata() to document::mdata(). This fixes -fpermissive errors with GCC. 
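For context, the clash comes from document naming a member function after the
corpus::metadata class: once that member is declared, later mentions of the
name inside the class body find the member instead of the type, which GCC
diagnoses as a declaration that "changes meaning" of the name, an error unless
-fpermissive is given. A minimal sketch of the pattern (hypothetical code, not
from this repository, and it deliberately does not compile):

    #include <vector>

    class metadata
    {
      public:
        struct field { };
    };

    class document
    {
      public:
        // declares a member named `metadata`...
        const std::vector<metadata::field>& metadata() const;
        // ...so `metadata` here no longer names the class; GCC reports
        // that the declaration changes the meaning of 'metadata'
        void metadata(std::vector<metadata::field>&& fields);
    };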
--- include/corpus/document.h | 4 ++-- src/corpus/corpus.cpp | 8 ++++---- src/corpus/document.cpp | 4 ++-- src/corpus/file_corpus.cpp | 2 +- src/corpus/gz_corpus.cpp | 2 +- src/corpus/line_corpus.cpp | 2 +- src/index/inverted_index.cpp | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/corpus/document.h b/include/corpus/document.h index af758a2b3..aebeebfed 100644 --- a/include/corpus/document.h +++ b/include/corpus/document.h @@ -117,13 +117,13 @@ class document /** * @return the set of extra metadata fields for this document */ - const std::vector& metadata() const; + const std::vector& mdata() const; /** * Sets the extra metadata fields for this document * @param metadata The new metadata for this document */ - void metadata(std::vector&& metadata); + void mdata(std::vector&& metadata); private: /// The document id for this document diff --git a/src/corpus/corpus.cpp b/src/corpus/corpus.cpp index e9fce1ca6..feae632d4 100644 --- a/src/corpus/corpus.cpp +++ b/src/corpus/corpus.cpp @@ -43,8 +43,8 @@ std::unique_ptr corpus::load(const std::string& config_file) { auto config = cpptoml::parse_file(config_file); - auto corpus = config.get_as("corpus"); - if (!corpus) + auto corp = config.get_as("corpus"); + if (!corp) throw corpus_exception{"corpus missing from configuration file"}; auto prefix = config.get_as("prefix"); @@ -55,7 +55,7 @@ std::unique_ptr corpus::load(const std::string& config_file) if (!dataset) throw corpus_exception{"dataset missing from configuration file"}; - auto corpus_filename = *prefix + "/" + *dataset + "/" + *corpus; + auto corpus_filename = *prefix + "/" + *dataset + "/" + *corp; if (!filesystem::file_exists(corpus_filename)) throw corpus_exception{"corpus configuration file (" + corpus_filename + ") not present"}; @@ -72,7 +72,7 @@ std::unique_ptr corpus::load(const std::string& config_file) else encoding = "utf-8"; - std::unique_ptr result; + std::unique_ptr result; if (*type == "file-corpus") { diff --git a/src/corpus/document.cpp b/src/corpus/document.cpp index 19accca67..d1ad06326 100644 --- a/src/corpus/document.cpp +++ b/src/corpus/document.cpp @@ -83,12 +83,12 @@ void document::label(class_label label) label_ = label; } -const std::vector& document::metadata() const +const std::vector& document::mdata() const { return mdata_; } -void document::metadata(std::vector&& metadata) +void document::mdata(std::vector&& metadata) { mdata_ = std::move(metadata); } diff --git a/src/corpus/file_corpus.cpp b/src/corpus/file_corpus.cpp index c52b0ae69..9c34d58c1 100644 --- a/src/corpus/file_corpus.cpp +++ b/src/corpus/file_corpus.cpp @@ -56,7 +56,7 @@ document file_corpus::next() auto mdata = next_metadata(); // add "path" metadata manually mdata.insert(mdata.begin(), metadata::field{prefix_ + docs_[cur_].first}); - doc.metadata(std::move(mdata)); + doc.mdata(std::move(mdata)); ++cur_; return doc; diff --git a/src/corpus/gz_corpus.cpp b/src/corpus/gz_corpus.cpp index 0779394cb..49f4f5651 100644 --- a/src/corpus/gz_corpus.cpp +++ b/src/corpus/gz_corpus.cpp @@ -49,7 +49,7 @@ document gz_corpus::next() document doc{cur_id_++, label}; doc.content(line, encoding()); - doc.metadata(next_metadata()); + doc.mdata(next_metadata()); return doc; } diff --git a/src/corpus/line_corpus.cpp b/src/corpus/line_corpus.cpp index b292b549b..1fccffc65 100644 --- a/src/corpus/line_corpus.cpp +++ b/src/corpus/line_corpus.cpp @@ -50,7 +50,7 @@ document line_corpus::next() document doc{cur_id_++, label}; doc.content(parser_.next(), encoding()); - 
doc.metadata(next_metadata()); + doc.mdata(next_metadata()); return doc; } diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp index 028aa0b47..6889d5b36 100644 --- a/src/index/inverted_index.cpp +++ b/src/index/inverted_index.cpp @@ -197,7 +197,7 @@ void inverted_index::impl::tokenize_docs(corpus::corpus* docs, } mdata_writer.write(doc->id(), doc->length(), doc->counts().size(), - doc->metadata()); + doc->mdata()); idx_->impl_->set_label(doc->id(), doc->label()); // update chunk From 72747423abc61ba983d6366d2150f2c61e67eb26 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 15 Apr 2015 19:13:45 -0500 Subject: [PATCH 086/481] Update filename for CEEAUS testing files. --- src/test/tools/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/tools/CMakeLists.txt b/src/test/tools/CMakeLists.txt index 6a0ae6698..47f226799 100644 --- a/src/test/tools/CMakeLists.txt +++ b/src/test/tools/CMakeLists.txt @@ -1,7 +1,7 @@ ExternalProject_Add(ceeaus SOURCE_DIR ${meta_BINARY_DIR}/../../data/ceeaus DOWNLOAD_DIR ${meta_BINARY_DIR}/downloads - URL http://web.engr.illinois.edu/~massung1/files/ceeaus.tar.gz + URL http://web.engr.illinois.edu/~massung1/files/ceeaus-metadata.tar.gz URL_HASH "SHA256=8ea40b32f34e9ae8aedffe562ad468fc465d1cc0ff6a5c3bdf0ee42bb85c231e" CONFIGURE_COMMAND "" BUILD_COMMAND "" From 61d2fecbaf20b900729f1b5dc6d37fb0bf187f27 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 15 Apr 2015 19:41:29 -0500 Subject: [PATCH 087/481] Silence gcc warnings. --- include/corpus/metadata.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/corpus/metadata.h b/include/corpus/metadata.h index 345d9b198..5000bcfe8 100644 --- a/include/corpus/metadata.h +++ b/include/corpus/metadata.h @@ -65,7 +65,7 @@ class metadata using schema = std::vector; metadata(const char* start, const schema& sch) - : schema_{sch}, stream_{start} + : schema_{&sch}, stream_{start} { // nothing } @@ -78,15 +78,15 @@ class metadata template util::optional get(const std::string& name) { - for (uint64_t i = 0; i < schema_.size(); ++i) + for (uint64_t i = 0; i < schema_->size(); ++i) { - switch (schema_[i].type) + switch ((*schema_)[i].type) { case field_type::SIGNED_INT: { int64_t si; io::read_packed_binary(stream_, si); - if (schema_[i].name == name) + if ((*schema_)[i].name == name) return {field{si}}; break; } @@ -95,7 +95,7 @@ class metadata { uint64_t ui; io::read_packed_binary(stream_, ui); - if (schema_[i].name == name) + if ((*schema_)[i].name == name) return {field{ui}}; break; } @@ -104,7 +104,7 @@ class metadata { double d; io::read_packed_binary(stream_, d); - if (schema_[i].name == name) + if ((*schema_)[i].name == name) return {field{d}}; break; } @@ -113,7 +113,7 @@ class metadata { std::string s{stream_.input_}; stream_.input_ += s.size() + 1; - if (schema_[i].name == name) + if ((*schema_)[i].name == name) return {field{std::move(s)}}; break; } @@ -307,8 +307,8 @@ class metadata const char* input_; }; - /// reference to the metadata_file's schema - const schema& schema_; + /// pointer to the metadata_file's schema + const schema* schema_; /// the fake input stream used for read_packed_binary metadata_input_stream stream_; From 43fa49c6c1774673d9f42b6bad867efc23bc387a Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 15 Apr 2015 18:40:12 -0500 Subject: [PATCH 088/481] Switch travis-ci over to container infrastructure. This results in dramatically faster builds, which is good for everyone's sanity. 
--- .travis.yml | 48 +++++++++++++++++++++-------------- CMakeLists.txt | 27 +++++++++++++++++--- include/corpus/metadata.h | 18 ++++++------- src/test/tools/CMakeLists.txt | 2 +- 4 files changed, 62 insertions(+), 33 deletions(-) diff --git a/.travis.yml b/.travis.yml index 05421f2e4..8d8c433f0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,23 +3,33 @@ notifications: language: cpp +sudo: false + compiler: - clang - gcc +addons: + apt: + sources: + - ubuntu-toolchain-r-test + packages: + - g++-4.8 + - libicu-dev + - ninja-build + install: - - sudo apt-get update -qq - - sudo apt-get install libc6-i386 - - wget http://www.cmake.org/files/v3.0/cmake-3.0.1-Linux-i386.sh - - sudo sh cmake-3.0.1-Linux-i386.sh --prefix=/usr/local --exclude-subdir - - sudo apt-get install libicu-dev - # credit: https://github.com/beark/ftl/ - # install g++ 4.8, if tests are run with g++ - - if [ "`echo $CXX`" == "g++" ]; then sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test; fi - - if [ "`echo $CXX`" == "g++" ]; then sudo apt-get update; fi - - if [ "`echo $CXX`" == "g++" ]; then sudo apt-get install -qq g++-4.8; fi - - if [ "`echo $CXX`" == "g++" ]; then sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 50; fi - # install libc++ if tests are run with clang++ + - mkdir $HOME/lib + - export LD_LIBRARY_PATH=$HOME/lib:$LD_LIBRARY_PATH + - mkdir $HOME/bin + - export PATH=$HOME/bin:$PATH + - mkdir $HOME/include + - export CPLUS_INCLUDE_PATH=$HOME/include:$CPLUS_INCLUDE_PATH + - wget http://www.cmake.org/files/v3.2/cmake-3.2.2-Linux-x86_64.sh + - sh cmake-3.2.2-Linux-x86_64.sh --prefix=$HOME --exclude-subdir + # use g++-4.8 if g++ is our compiler + - if [ "`echo $CXX`" == "g++" ]; then export CXX=g++-4.8; fi + # install libc++ if tests are run with clang++ - if [ "`echo $CXX`" == "clang++" ]; then cwd=$(pwd); fi - if [ "`echo $CXX`" == "clang++" ]; then svn co --quiet http://llvm.org/svn/llvm-project/libcxx/trunk libcxx; fi - if [ "`echo $CXX`" == "clang++" ]; then git clone https://github.com/pathscale/libcxxrt.git libcxxrt; fi @@ -28,16 +38,16 @@ install: - if [ "`echo $CXX`" == "clang++" ]; then cd build; fi - if [ "`echo $CXX`" == "clang++" ]; then cmake -DCMAKE_BUILD_TYPE=Release ../; fi - if [ "`echo $CXX`" == "clang++" ]; then make; fi - - if [ "`echo $CXX`" == "clang++" ]; then sudo cp lib/libcxxrt.so /usr/lib; fi - - if [ "`echo $CXX`" == "clang++" ]; then sudo ln -sf /usr/lib/libcxxrt.so /usr/lib/libcxxrt.so.1; fi - - if [ "`echo $CXX`" == "clang++" ]; then sudo ln -sf /usr/lib/libcxxrt.so /usr/lib/libcxxrt.so.1.0; fi + - if [ "`echo $CXX`" == "clang++" ]; then cp lib/libcxxrt.so $HOME/lib; fi + - if [ "`echo $CXX`" == "clang++" ]; then ln -sf $HOME/lib/libcxxrt.so $HOME/lib/libcxxrt.so.1; fi + - if [ "`echo $CXX`" == "clang++" ]; then ln -sf $HOME/lib/libcxxrt.so $HOME/lib/libcxxrt.so.1.0; fi - if [ "`echo $CXX`" == "clang++" ]; then cd $cwd; fi - if [ "`echo $CXX`" == "clang++" ]; then cd libcxx; fi - if [ "`echo $CXX`" == "clang++" ]; then mkdir build; fi - if [ "`echo $CXX`" == "clang++" ]; then cd build; fi - - if [ "`echo $CXX`" == "clang++" ]; then cmake -DLIBCXX_CXX_ABI=libcxxrt -DLIBCXX_CXX_ABI_INCLUDE_PATHS="../../libcxxrt/src" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr ..; fi + - if [ "`echo $CXX`" == "clang++" ]; then cmake -DLIBCXX_CXX_ABI=libcxxrt -DLIBCXX_CXX_ABI_INCLUDE_PATHS="../../libcxxrt/src" -DLIBCXX_CXX_ABI_LIBRARY_PATH=$HOME/lib -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$HOME ..; fi - if [ "`echo $CXX`" == "clang++" ]; then make; fi - - if 
[ "`echo $CXX`" == "clang++" ]; then sudo make install; fi + - if [ "`echo $CXX`" == "clang++" ]; then make install; fi - if [ "`echo $CXX`" == "clang++" ]; then cd $cwd; fi before_script: @@ -46,6 +56,6 @@ before_script: - cp ../config.toml ./ script: - - cmake ../ -DCMAKE_BUILD_TYPE=Debug - - make + - cmake ../ -DCMAKE_BUILD_TYPE=Debug -G Ninja + - ninja - ctest --output-on-failure diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c9b30c52..53954a773 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,16 +75,24 @@ if(UNIX) message("-- Locating libc++...") find_library(LIBCXX_LIBRARY NAMES c++ cxx) if(LIBCXX_LIBRARY) - message("-- Located libc++, using it.") + message("-- Located libc++: ${LIBCXX_LIBRARY}") set(LIBCXX_OPTIONS "-stdlib=libc++") + get_filename_component(LIBCXX_LIB_PATH ${LIBCXX_LIBRARY} + DIRECTORY) + find_path(LIBCXX_PREFIX c++/v1/algorithm + PATHS ${LIBCXX_LIB_PATH}/../include + ${CMAKE_SYSTEM_PREFIX_PATH}) + set(LIBCXX_INCLUDE_DIR ${LIBCXX_PREFIX}/c++/v1/) + message("-- Located libc++ include path: ${LIBCXX_INCLUDE_DIR}") + message("-- Locating libc++'s abi...") find_library(LIBCXXABI_LIBRARY NAMES c++abi) find_library(LIBCXXRT_LIBRARY NAMES cxxrt) if(LIBCXXABI_LIBRARY) - message("-- Found libc++abi, using it.") + message("-- Found libc++abi: ${LIBCXXABI_LIBRARY}") set(CXXABI_LIBRARY ${LIBCXXABI_LIBRARY}) elseif(LIBCXXRT_LIBRARY) - message("-- Found libcxxrt, using it.") + message("-- Found libcxxrt: ${LIBCXXRT_LIBRARY}") set(CXXABI_LIBRARY ${LIBCXXRT_LIBRARY}) else() message("-- No abi library found. " @@ -102,7 +110,8 @@ if(UNIX) endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${LIBCXX_OPTIONS}") - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${CXXABI_LIBRARY}") + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${LIBCXX_OPTIONS} ${CXXABI_LIBRARY} -L${LIBCXX_LIB_PATH}") + set(CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES} ${LIBCXX_INCLUDE_DIR}") endif() add_library(meta-definitions INTERFACE) @@ -122,6 +131,16 @@ if(LIBDL_LIBRARY) target_link_libraries(meta-definitions INTERFACE ${LIBDL_LIBRARY}) endif() +if(CXXABI_LIBRARY) + target_link_libraries(meta-definitions INTERFACE ${CXXABI_LIBRARY}) +endif() + +if(LIBCXX_LIBRARY) + target_include_directories(meta-definitions SYSTEM INTERFACE ${LIBCXX_INCLUDE_DIR}) + target_compile_options(meta-definitions INTERFACE ${LIBCXX_OPTIONS}) + target_link_libraries(meta-definitions INTERFACE -L${LIBCXX_LIB_PATH}) +endif() + if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") target_compile_definitions(meta-definitions INTERFACE -D_DARWIN_USE_64_BIT_INODE=1) diff --git a/include/corpus/metadata.h b/include/corpus/metadata.h index 345d9b198..5000bcfe8 100644 --- a/include/corpus/metadata.h +++ b/include/corpus/metadata.h @@ -65,7 +65,7 @@ class metadata using schema = std::vector; metadata(const char* start, const schema& sch) - : schema_{sch}, stream_{start} + : schema_{&sch}, stream_{start} { // nothing } @@ -78,15 +78,15 @@ class metadata template util::optional get(const std::string& name) { - for (uint64_t i = 0; i < schema_.size(); ++i) + for (uint64_t i = 0; i < schema_->size(); ++i) { - switch (schema_[i].type) + switch ((*schema_)[i].type) { case field_type::SIGNED_INT: { int64_t si; io::read_packed_binary(stream_, si); - if (schema_[i].name == name) + if ((*schema_)[i].name == name) return {field{si}}; break; } @@ -95,7 +95,7 @@ class metadata { uint64_t ui; io::read_packed_binary(stream_, ui); - if (schema_[i].name == name) + if ((*schema_)[i].name == name) return {field{ui}}; break; } @@ -104,7 +104,7 @@ class 
metadata { double d; io::read_packed_binary(stream_, d); - if (schema_[i].name == name) + if ((*schema_)[i].name == name) return {field{d}}; break; } @@ -113,7 +113,7 @@ class metadata { std::string s{stream_.input_}; stream_.input_ += s.size() + 1; - if (schema_[i].name == name) + if ((*schema_)[i].name == name) return {field{std::move(s)}}; break; } @@ -307,8 +307,8 @@ class metadata const char* input_; }; - /// reference to the metadata_file's schema - const schema& schema_; + /// pointer to the metadata_file's schema + const schema* schema_; /// the fake input stream used for read_packed_binary metadata_input_stream stream_; diff --git a/src/test/tools/CMakeLists.txt b/src/test/tools/CMakeLists.txt index 6a0ae6698..47f226799 100644 --- a/src/test/tools/CMakeLists.txt +++ b/src/test/tools/CMakeLists.txt @@ -1,7 +1,7 @@ ExternalProject_Add(ceeaus SOURCE_DIR ${meta_BINARY_DIR}/../../data/ceeaus DOWNLOAD_DIR ${meta_BINARY_DIR}/downloads - URL http://web.engr.illinois.edu/~massung1/files/ceeaus.tar.gz + URL http://web.engr.illinois.edu/~massung1/files/ceeaus-metadata.tar.gz URL_HASH "SHA256=8ea40b32f34e9ae8aedffe562ad468fc465d1cc0ff6a5c3bdf0ee42bb85c231e" CONFIGURE_COMMAND "" BUILD_COMMAND "" From 53cc73ca6a2494e180576a0acf266ac68cce7d66 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 15 Apr 2015 22:12:39 -0500 Subject: [PATCH 089/481] Switch back to using make for travis. --- .travis.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8d8c433f0..dcfc465d6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,6 @@ addons: packages: - g++-4.8 - libicu-dev - - ninja-build install: - mkdir $HOME/lib @@ -56,6 +55,6 @@ before_script: - cp ../config.toml ./ script: - - cmake ../ -DCMAKE_BUILD_TYPE=Debug -G Ninja - - ninja + - cmake ../ -DCMAKE_BUILD_TYPE=Debug + - make - ctest --output-on-failure From 46742ae75308973c7d8c598fdf7b2bcd2620b0b4 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Mon, 20 Apr 2015 22:01:28 -0500 Subject: [PATCH 090/481] Add support for reading postings lists in a streaming manner. This also consolidates our packed binary io operations into its own namespace, using VarInt encoding. 
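To illustrate the intended use, a hypothetical sketch (not code contained in
this patch): a postings_stream decodes its (SecondaryKey, double) pairs lazily
from the mmapped postings file, so a term's postings can be walked without
materializing a whole postings_data object:

    #include <iostream>

    // assumes idx is a loaded inverted_index and t_id a valid term_id;
    // stream_for returns util::optional<postings_stream<doc_id>>
    auto stream = idx->stream_for(t_id);
    if (stream)
    {
        std::cout << stream->size() << " docs, "
                  << stream->total_counts() << " total occurrences\n";
        for (const auto& pr : *stream) // pr is a (doc_id, double) pair
            std::cout << pr.first << " " << pr.second << "\n";
    }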
--- include/corpus/metadata.h | 10 +- include/index/forward_index.h | 9 ++ include/index/inverted_index.h | 7 + include/index/postings_data.h | 8 + include/index/postings_data.tcc | 83 +++++++---- include/index/postings_file.h | 36 ++++- include/index/postings_file_writer.h | 46 +++++- include/index/postings_stream.h | 209 +++++++++++++++++++++++++++ include/io/binary.h | 116 --------------- include/io/packed.h | 165 +++++++++++++++++++++ src/index/forward_index.cpp | 18 +-- src/index/inverted_index.cpp | 6 + src/index/metadata_file.cpp | 4 +- src/index/metadata_writer.cpp | 13 +- 14 files changed, 544 insertions(+), 186 deletions(-) create mode 100644 include/index/postings_stream.h create mode 100644 include/io/packed.h diff --git a/include/corpus/metadata.h b/include/corpus/metadata.h index 5000bcfe8..fbda1cfbb 100644 --- a/include/corpus/metadata.h +++ b/include/corpus/metadata.h @@ -15,7 +15,7 @@ #include #include "cpptoml.h" -#include "io/binary.h" +#include "io/packed.h" #include "util/optional.h" namespace meta @@ -85,7 +85,7 @@ class metadata case field_type::SIGNED_INT: { int64_t si; - io::read_packed_binary(stream_, si); + io::packed::read(stream_, si); if ((*schema_)[i].name == name) return {field{si}}; break; @@ -94,7 +94,7 @@ class metadata case field_type::UNSIGNED_INT: { uint64_t ui; - io::read_packed_binary(stream_, ui); + io::packed::read(stream_, ui); if ((*schema_)[i].name == name) return {field{ui}}; break; @@ -103,7 +103,7 @@ class metadata case field_type::DOUBLE: { double d; - io::read_packed_binary(stream_, d); + io::packed::read(stream_, d); if ((*schema_)[i].name == name) return {field{d}}; break; @@ -310,7 +310,7 @@ class metadata /// pointer to the metadata_file's schema const schema* schema_; - /// the fake input stream used for read_packed_binary + /// the fake input stream used for io::packed::read metadata_input_stream stream_; }; diff --git a/include/index/forward_index.h b/include/index/forward_index.h index 80698ebde..548fb3e0f 100644 --- a/include/index/forward_index.h +++ b/include/index/forward_index.h @@ -14,7 +14,9 @@ #include "index/disk_index.h" #include "index/make_index.h" +#include "index/postings_stream.h" #include "util/disk_vector.h" +#include "util/optional.h" #include "meta.h" namespace meta @@ -117,6 +119,13 @@ class forward_index : public disk_index virtual std::shared_ptr search_primary(doc_id d_id) const; + /** + * @param d_id The doc_id to search for + * @return the postings stream for a given doc_id + */ + util::optional> + stream_for(doc_id d_id) const; + /** * @param d_id The document id of the doc to convert to liblinear format * @return the string representation liblinear format diff --git a/include/index/inverted_index.h b/include/index/inverted_index.h index 0080ada23..6767b32e8 100644 --- a/include/index/inverted_index.h +++ b/include/index/inverted_index.h @@ -16,6 +16,7 @@ #include "index/disk_index.h" #include "index/make_index.h" +#include "index/postings_stream.h" namespace meta { @@ -129,6 +130,12 @@ class inverted_index : public disk_index virtual std::shared_ptr search_primary(term_id t_id) const; + /** + * @param t_id The trem_id to search for + * @return the postings stream for a given term_id + */ + util::optional> stream_for(term_id t_id) const; + /** * @param t_id The term to search for * @return the document frequency of a term (number of documents it diff --git a/include/index/postings_data.h b/include/index/postings_data.h index afa094cbd..0e19a5678 100644 --- a/include/index/postings_data.h +++ 
b/include/index/postings_data.h @@ -113,6 +113,14 @@ class postings_data */ void set_counts(const count_t& counts); + /** + * @param begin The beginning of the counts to assign into this + * postings_data + * @param end The end of the counts to assign into this postings_data + */ + template + void set_counts(InputIterator begin, InputIterator end); + /** * @param other The postings_data to compare with * @return whether this postings_data is less than (has a smaller diff --git a/include/index/postings_data.tcc b/include/index/postings_data.tcc index 75e629df1..a42e7f8f1 100644 --- a/include/index/postings_data.tcc +++ b/include/index/postings_data.tcc @@ -5,6 +5,7 @@ #include #include +#include #include "index/postings_data.h" namespace meta @@ -15,13 +16,14 @@ namespace index template postings_data::postings_data(PrimaryKey p_id) : p_id_{p_id} -{/* nothing */ +{ /* nothing */ } template void postings_data::merge_with(postings_data& other) { - auto searcher = [](const pair_t& p, const SecondaryKey& s) { + auto searcher = [](const pair_t& p, const SecondaryKey& s) + { return p.first < s; }; @@ -43,16 +45,16 @@ void postings_data::merge_with(postings_data& other) if (counts_.size() > orig_length) { std::sort(counts_.begin(), counts_.end(), - [](const pair_t& a, const pair_t& b) { - return a.first < b.first; - } - ); + [](const pair_t& a, const pair_t& b) + { + return a.first < b.first; + }); } } template -void postings_data - ::increase_count(SecondaryKey s_id, double amount) +void postings_data::increase_count(SecondaryKey s_id, + double amount) { counts_[s_id] += amount; } @@ -64,8 +66,8 @@ double postings_data::count(SecondaryKey s_id) const } template -const std::vector>& postings_data - ::counts() const +const std::vector>& + postings_data::counts() const { return counts_.contents(); } @@ -78,15 +80,25 @@ void postings_data::set_counts(const count_t& counts) } template -void postings_data - ::set_primary_key(PrimaryKey new_key) +template +void postings_data::set_counts(InputIterator begin, + InputIterator end) +{ + for (; begin != end; ++begin) + counts_.emplace_back(*begin); + counts_.shrink_to_fit(); +} + +template +void postings_data::set_primary_key( + PrimaryKey new_key) { p_id_ = new_key; } template -bool postings_data::operator<(const postings_data - & other) const +bool postings_data:: + operator<(const postings_data& other) const { return primary_key() < other.primary_key(); } @@ -106,10 +118,16 @@ PrimaryKey postings_data::primary_key() const template template -void postings_data - ::write_compressed(io::compressed_file_writer - & writer) const -{ +void postings_data::write_compressed( + io::compressed_file_writer& writer) const +{ + writer.write(counts_.size()); + writer.write(std::accumulate(counts_.begin(), counts_.end(), uint64_t{0}, + [](uint64_t cur, const pair_t& pr) + { + return cur + + static_cast(pr.second); + })); count_t mutable_counts{counts_.contents()}; writer.write(mutable_counts[0].first); if (std::is_same::value) @@ -139,27 +157,26 @@ void postings_data writer.write(mutable_counts[i].second); } } - - // mark end of postings_data - writer.write(delimiter_); } template template void postings_data::read_compressed( - io::compressed_file_reader& reader) + io::compressed_file_reader& reader) { + uint64_t size = reader.next(); + + // ignore total counts sum + reader.next(); + counts_.clear(); + counts_.reserve(size); + uint64_t last_id = 0; - while (true) + for (uint64_t i = 0; i < size; ++i) { uint64_t this_id = reader.next(); - - // have we reached a delimiter? 
- if (this_id == delimiter_) - break; - // we're using gap encoding last_id += this_id; SecondaryKey key{last_id}; @@ -186,15 +203,17 @@ void postings_data::read_compressed( namespace { template -uint64_t length(const T& elem, typename std::enable_if< - std::is_same::value>::type* = nullptr) +uint64_t length(const T& elem, + typename std::enable_if::value>:: + type* = nullptr) { return elem.size(); } template -uint64_t length(const T& elem, typename std::enable_if< - !std::is_same::value>::type* = nullptr) +uint64_t length(const T& elem, + typename std::enable_if::value>:: + type* = nullptr) { return sizeof(elem); } diff --git a/include/index/postings_file.h b/include/index/postings_file.h index f2d1bb42c..284af2eac 100644 --- a/include/index/postings_file.h +++ b/include/index/postings_file.h @@ -11,14 +11,21 @@ #define META_INDEX_POSTINGS_FILE_H_ #include "index/postings_data.h" +#include "index/postings_stream.h" #include "io/mmap_file.h" #include "util/disk_vector.h" +#include "util/optional.h" namespace meta { namespace index { +/** + * File that stores the postings list for an index on disk. Each postings + * list is indexed via PrimaryKey and consists of pairs of (SecondaryKey, + * double). + */ template class postings_file { @@ -30,11 +37,27 @@ class postings_file * @param filename The path to the file */ postings_file(const std::string& filename) - : postings_{filename}, bit_locations_{filename + "_index"} + : postings_{filename}, byte_locations_{filename + "_index"} { // nothing } + /** + * Obtains a postings stream object for the given primary key. + * @param pk The primary key to look up + * @return a postings stream for this primary key, if it is in the + * postings file + */ + template + util::optional> + find_stream(PrimaryKey pk) const + { + if (pk < byte_locations_.size()) + return postings_stream{ + postings_, byte_locations_.at(pk)}; + return util::nullopt; + } + /** * Obtains a postings data object for the given primary key. * @param pk The primary key to look up @@ -48,13 +71,10 @@ class postings_file uint64_t idx{pk}; // if we are in-bounds of the postings file, populate counts - if (idx < bit_locations_.size()) + if (idx < byte_locations_.size()) { - io::compressed_file_reader reader{ - postings_, io::default_compression_reader_func}; - reader.seek(bit_locations_.at(idx)); - - pdata->template read_compressed(reader); + auto stream = find_stream(pk); + pdata->set_counts(stream->begin(), stream->end()); } return pdata; @@ -62,7 +82,7 @@ class postings_file private: io::mmap_file postings_; - util::disk_vector bit_locations_; + util::disk_vector byte_locations_; }; } } diff --git a/include/index/postings_file_writer.h b/include/index/postings_file_writer.h index 1af16d5eb..066560ba5 100644 --- a/include/index/postings_file_writer.h +++ b/include/index/postings_file_writer.h @@ -10,7 +10,9 @@ #ifndef META_INDEX_POSTINGS_FILE_WRITER_H_ #define META_INDEX_POSTINGS_FILE_WRITER_H_ -#include "io/compressed_file_writer.h" +#include +#include +#include "io/packed.h" #include "util/disk_vector.h" namespace meta @@ -26,8 +28,9 @@ class postings_file_writer * @param filename The filename (prefix) for the postings file. 
*/ postings_file_writer(const std::string& filename, uint64_t unique_keys) - : output_{filename, io::default_compression_writer_func}, - bit_locations_{filename + "_index", unique_keys}, + : output_{filename, std::ios::binary}, + byte_locations_{filename + "_index", unique_keys}, + byte_pos_{0}, id_{0} { // nothing @@ -40,14 +43,41 @@ class postings_file_writer template void write(const PostingsData& pdata) { - bit_locations_[id_] = output_.bit_location(); - pdata.template write_compressed(output_); + byte_locations_[id_] = byte_pos_; + byte_pos_ += io::packed::write(output_, pdata.counts().size()); + + auto total_counts = std::accumulate( + pdata.counts().begin(), pdata.counts().end(), uint64_t{0}, + [](uint64_t cur, const typename PostingsData::pair_t& pr) + { + return cur + static_cast(pr.second); + }); + byte_pos_ += io::packed::write(output_, total_counts); + + uint64_t last_id = 0; + for (const auto& count : pdata.counts()) + { + byte_pos_ += io::packed::write(output_, count.first - last_id); + + if (std::is_same::value) + { + byte_pos_ += io::packed::write( + output_, static_cast(count.second)); + } + else + { + byte_pos_ += io::packed::write(output_, count.second); + } + + last_id = count.first; + } ++id_; - } + } private: - io::compressed_file_writer output_; - util::disk_vector bit_locations_; + std::ofstream output_; + util::disk_vector byte_locations_; + uint64_t byte_pos_; uint64_t id_; }; } diff --git a/include/index/postings_stream.h b/include/index/postings_stream.h new file mode 100644 index 000000000..9f3c68620 --- /dev/null +++ b/include/index/postings_stream.h @@ -0,0 +1,209 @@ +/** + * @file postings_stream.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_INDEX_POSTINGS_STREAM_H_ +#define META_INDEX_POSTINGS_STREAM_H_ + +#include +#include + +#include "io/mmap_file.h" +#include "io/compressed_file_reader.h" +#include "util/optional.h" +#include "io/packed.h" + +namespace meta +{ +namespace index +{ + +/** + * A stream for extracting the postings list for a specific key in a + * postings file. This can be used instead of postings_data to avoid + * reading in the entire postings list into memory at once. + */ +template +class postings_stream +{ + private: + struct char_input_stream + { + char_input_stream(const char* input) : input_{input} + { + // nothing + } + + char get() + { + return *input_++; + } + + const char* input_; + }; + + public: + /** + * Creates a postings stream reading from the given file at the given + * byte position. + * + * @param file The file that contains the postings lists + * @param seek_pos The position in the file to begin reading from + */ + postings_stream(const io::mmap_file& file, uint64_t seek_pos) + : file_{&file}, seek_pos_{seek_pos} + { + char_input_stream stream{file_->begin() + seek_pos_}; + + io::packed::read(stream, size_); + io::packed::read(stream, total_counts_); + } + + /** + * @return the number of SecondaryKeys in this postings list. + */ + uint64_t size() const + { + return size_; + } + + /** + * @return the total sum of the counts for SecondaryKeys in this + * postings list. + */ + uint64_t total_counts() const + { + return total_counts_; + } + + /** + * An iterator over the (SecondaryKey, double) pairs of this postings + * list. 
+ */ + class iterator + { + public: + using value_type = std::pair; + using reference = const value_type&; + using pointer = const value_type*; + using iterator_category = std::input_iterator_tag; + using difference_type = std::ptrdiff_t; + + friend postings_stream; + + iterator() : size_{0}, pos_{0} + { + // nothing + } + + iterator& operator++() + { + if (stor_) + { + if (pos_ == size_) + { + stor_ = util::nullopt; + pos_ = 0; + size_ = 0; + } + else + { + uint64_t id; + io::packed::read(*stream_, id); + // gap encoding + stor_->first += id; + + if (std::is_same::value) + { + uint64_t next; + io::packed::read(*stream_, next); + stor_->second = static_cast(next); + } + else + { + io::packed::read(*stream_, stor_->second); + } + ++pos_; + } + } + return *this; + } + + util::optional> operator++(int) + { + auto proxy = *(*this); + ++(*this); + return proxy; + } + + reference operator*() const + { + return *stor_; + } + + pointer operator->() const + { + return &(*stor_); + } + + bool operator==(const iterator& other) + { + return std::tie(stor_, size_, pos_) + == std::tie(other.stor_, other.size_, other.pos_); + } + + bool operator!=(const iterator& other) + { + return !(*this == other); + } + + private: + iterator(const io::mmap_file& file, uint64_t seek_pos) + : stream_{file.begin() + seek_pos}, + pos_{0}, + stor_{std::make_pair(SecondaryKey{0}, 0.0)} + { + io::packed::read(*stream_, size_); + + // ignore total counts + uint64_t total_counts; + io::packed::read(*stream_, total_counts); + ++(*this); + } + + util::optional stream_; + uint64_t size_; + uint64_t pos_; + util::optional> stor_; + }; + + /** + * @return an iterator to the beginning of the list + */ + iterator begin() const + { + return {*file_, seek_pos_}; + } + + /** + * @return an iterator to the ending of the list + */ + iterator end() const + { + return {}; + } + + private: + const io::mmap_file* file_; + uint64_t seek_pos_; + uint64_t size_; + uint64_t total_counts_; +}; +} +} +#endif diff --git a/include/io/binary.h b/include/io/binary.h index cf0e3b340..9b40cfd42 100644 --- a/include/io/binary.h +++ b/include/io/binary.h @@ -65,122 +65,6 @@ inline void read_binary(std::istream& in, std::string& str) { std::getline(in, str, '\0'); } - -/** - * Writes an integral type in a packed representation. The first byte is a - * flag byte used to indicate two things: the first bit indicates the sign - * of the number, and then the lowest four bits indicates the length (in - * bytes) of the unsigned number that follows. - * - * @see http://dlib.net/dlib/serialize.h.html - * @param out The stream to write to - * @param elem The integral type to write in packed format - * @return the number of bytes used to write out elem - */ -template -uint64_t write_packed_binary(std::ostream& out, T elem) -{ - static_assert(std::is_integral::value, - "packed binary requires integers"); - - std::array buffer; - if (elem < 0) - { - elem *= -1; - buffer[0] = 0x80; - } - else - { - buffer[0] = 0; - } - - uint8_t idx = 1; - for (; idx <= sizeof(T) && elem > 0; ++idx) - { - buffer[idx] = static_cast(elem & 0xFF); - elem >>= 8; - } - buffer[0] |= (idx - 1); - out.write(reinterpret_cast(&buffer[0]), idx); - return idx; -} - -/** - * Writes a double in a packed integer binary representation. This splits - * the double into its mantissa and exponent such that - * mantissa * std::pow(2.0, exponent) == elem. The mantissa and exponent - * are integers are are written using the integer packed format. 
- * - * @see - *http://stackoverflow.com/questions/5672960/how-can-i-extract-the-mantissa-of-a-double - * @see http://dlib.net/dlib/float_details.h.html - * @param out The stream to write to - * @param elem The double to write in packed format - * @return the number of bytes used to write out elem - */ -inline uint64_t write_packed_binary(std::ostream& out, double elem) -{ - int exp; - auto digits = std::numeric_limits::digits; - auto mantissa - = static_cast(std::frexp(elem, &exp) * (1ul << digits)); - int16_t exponent = exp - digits; - - // see dlib link above; tries to shrink mantissa for more efficient - // serialization - for (uint8_t i = 0; i < sizeof(mantissa) && (mantissa & 0xFF) == 0; ++i) - { - mantissa >>= 8; - exponent += 8; - } - - auto bytes = write_packed_binary(out, mantissa); - bytes += write_packed_binary(out, exponent); - return bytes; -} - -/** - * Reads an integer from its packed binary representation. - * @param in The stream to read from - * @param elem The element to write into - */ -template -void read_packed_binary(InputStream& in, T& elem) -{ - static_assert(std::is_integral::value, - "packed binary requires integers"); - - auto flag_byte = static_cast(in.get()); - auto size = flag_byte & 0x0F; - - elem = 0; - for (uint8_t idx = 0; idx < size; ++idx) - { - uint64_t byte = static_cast(in.get()); - byte <<= 8 * idx; - elem |= byte; - } - - if (std::is_signed::value && (flag_byte & 0x80) > 0) - { - elem *= -1; - } -} - -/** - * Reads a double from its packed binary representation. - * @param in The stream to read from - * @param elem The element to write into - */ -template -void read_packed_binary(InputStream& in, double& elem) -{ - int64_t mantissa; - int16_t exponent; - read_packed_binary(in, mantissa); - read_packed_binary(in, exponent); - elem = mantissa * std::pow(2.0, exponent); -} } } #endif diff --git a/include/io/packed.h b/include/io/packed.h new file mode 100644 index 000000000..defc84ad1 --- /dev/null +++ b/include/io/packed.h @@ -0,0 +1,165 @@ +/** + * @file packed.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_IO_PACKED_H_ +#define META_IO_PACKED_H_ + +#include +#include +#include + +namespace meta +{ +namespace io +{ +namespace packed +{ + +/** + * Writes an unsigned integer in a packed representation. Each byte has + * seven bits of data, with the MSB reserved for a flag indicating whether + * to continue reading bytes. + * + * @param stream The stream to write to + * @param value The value to write + * @return the number of bytes used to write out the value + */ +template +uint64_t write(OutputStream& stream, uint64_t value) +{ + uint64_t size = 0; + while (value > 127) + { + ++size; + stream.put((value & 127) | 128); + value >>= 7; + } + stream.put(value); + return size + 1; +} + +/** + * Writes a signed integer in a packed representation. This uses the same + * representation as for unsigned integers by first converting the signed + * integer into an unsigned one using zig-zag encoding. 
+ *
+ * @see https://developers.google.com/protocol-buffers/docs/encoding#types
+ *
+ * @param stream The stream to write to
+ * @param value The value to write
+ * @return the number of bytes used to write out the value
+ */
+template <class OutputStream>
+uint64_t write(OutputStream& stream, int64_t value)
+{
+    uint64_t elem = (value << 1) ^ (value >> 63);
+    return write(stream, elem);
+}
+
+/**
+ * Writes a double in a packed representation. This splits
+ * the double into its mantissa and exponent such that
+ * mantissa * std::pow(2.0, exponent) == value. The mantissa and exponent
+ * are integers and are written using the integer packed format.
+ *
+ * @see http://stackoverflow.com/questions/5672960/how-can-i-extract-the-mantissa-of-a-double
+ * @see http://dlib.net/dlib/float_details.h.html
+ *
+ * @param stream The stream to write to
+ * @param value The value to write
+ * @return the number of bytes used to write out the value
+ */
+template <class OutputStream>
+uint64_t write(OutputStream& stream, double value)
+{
+    int exp;
+    auto digits = std::numeric_limits<double>::digits;
+    auto mantissa
+        = static_cast<int64_t>(std::frexp(value, &exp) * (1ul << digits));
+    int64_t exponent = exp - digits;
+
+    // see dlib link above; tries to shrink mantissa for more efficient
+    // serialization
+    for (uint8_t i = 0; i < sizeof(mantissa) && (mantissa & 0xFF) == 0; ++i)
+    {
+        mantissa >>= 8;
+        exponent += 8;
+    }
+
+    auto bytes = write(stream, mantissa);
+    bytes += write(stream, exponent);
+    return bytes;
+}
+
+/**
+ * Reads an unsigned integer from its packed representation.
+ *
+ * @param stream The stream to read from
+ * @param value The element to write into
+ * @return the number of bytes read
+ */
+template <class InputStream>
+uint64_t read(InputStream& stream, uint64_t& value)
+{
+    value = 0;
+    uint64_t size = 0;
+    uint8_t byte;
+    do
+    {
+        byte = stream.get();
+        value |= static_cast<uint64_t>(byte & 127) << (7 * size);
+        ++size;
+    } while (byte & 128);
+    return size;
+}
+
+/**
+ * Reads a signed integer from its packed representation. This does the
+ * reverse of zig-zag encoding to convert from an unsigned integer to a
+ * signed integer after reading from the file.
+ *
+ * @see http://stackoverflow.com/questions/2210923/zig-zag-decoding
+ *
+ * @param stream The stream to read from
+ * @param value The element to write into
+ * @return the number of bytes read
+ */
+template <class InputStream>
+uint64_t read(InputStream& stream, int64_t& value)
+{
+    uint64_t elem;
+    auto bytes = read(stream, elem);
+
+    value = (elem >> 1) ^ (-(elem & 1));
+
+    return bytes;
+}
+
+/**
+ * Reads a double from its packed representation.
+ * + * @param stream The stream to read from + * @param value The element to write into + * @return the number of bytes read + */ +template +uint64_t read(InputStream& stream, double& value) +{ + int64_t mantissa; + int64_t exponent; + + auto bytes = read(stream, mantissa); + bytes += read(stream, exponent); + value = mantissa * std::pow(2.0, exponent); + return bytes; +} +} +} +} +#endif diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index 3ab5bf9e0..1bd0612d6 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -304,22 +304,22 @@ auto forward_index::search_primary(doc_id d_id) const return fwd_impl_->postings_->find(d_id); } -void forward_index::impl::uninvert(const inverted_index& inv_idx) +util::optional> + forward_index::stream_for(doc_id d_id) const { - io::compressed_file_reader inv_reader{inv_idx.index_name() - + idx_->impl_->files[POSTINGS], - io::default_compression_reader_func}; + return fwd_impl_->postings_->find_stream(d_id); +} +void forward_index::impl::uninvert(const inverted_index& inv_idx) +{ term_id t_id{0}; chunk_handler handler{idx_->index_name()}; { auto producer = handler.make_producer(); - while (inv_reader.has_next()) + for (term_id t_id{0}; t_id < inv_idx.unique_terms(); ++t_id) { - inverted_pdata_type pdata{t_id}; - pdata.read_compressed(inv_reader); - producer(pdata.primary_key(), pdata.counts()); - ++t_id; + auto pdata = inv_idx.search_primary(t_id); + producer(pdata->primary_key(), pdata->counts()); } } diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp index 6889d5b36..d43ea283e 100644 --- a/src/index/inverted_index.cpp +++ b/src/index/inverted_index.cpp @@ -307,5 +307,11 @@ auto inverted_index::search_primary(term_id t_id) const { return inv_impl_->postings_->find(t_id); } + +util::optional> + inverted_index::stream_for(term_id t_id) const +{ + return inv_impl_->postings_->find_stream(t_id); +} } } diff --git a/src/index/metadata_file.cpp b/src/index/metadata_file.cpp index f7f5858dc..c80abd4e2 100644 --- a/src/index/metadata_file.cpp +++ b/src/index/metadata_file.cpp @@ -4,7 +4,7 @@ */ #include "index/metadata_file.h" -#include "io/binary.h" +#include "io/packed.h" namespace meta { @@ -41,7 +41,7 @@ metadata_file::metadata_file(const std::string& prefix) // read in the header to populate the schema char_input_stream stream{md_db_.begin(), md_db_.begin() + md_db_.size()}; uint64_t num_fields; - io::read_packed_binary(stream, num_fields); + io::packed::read(stream, num_fields); schema_.reserve(num_fields); for (uint64_t i = 0; i < num_fields; ++i) diff --git a/src/index/metadata_writer.cpp b/src/index/metadata_writer.cpp index af4d86475..44e0f1d19 100644 --- a/src/index/metadata_writer.cpp +++ b/src/index/metadata_writer.cpp @@ -5,6 +5,7 @@ #include "index/metadata_writer.h" #include "io/binary.h" +#include "io/packed.h" namespace meta { @@ -19,7 +20,7 @@ metadata_writer::metadata_writer(const std::string& prefix, uint64_t num_docs, schema_{std::move(schema)} { // write metadata header - byte_pos_ += io::write_packed_binary(db_file_, schema_.size() + 2); + byte_pos_ += io::packed::write(db_file_, schema_.size() + 2); byte_pos_ += io::write_binary(db_file_, std::string{"length"}); byte_pos_ += io::write_binary(db_file_, corpus::metadata::field_type::UNSIGNED_INT); @@ -41,8 +42,8 @@ void metadata_writer::write(doc_id d_id, uint64_t length, uint64_t num_unique, seek_pos_[d_id] = byte_pos_; // write "mandatory" metadata - byte_pos_ += io::write_packed_binary(db_file_, length); - byte_pos_ += 
io::write_packed_binary(db_file_, num_unique); + byte_pos_ += io::packed::write(db_file_, length); + byte_pos_ += io::packed::write(db_file_, num_unique); // write optional metadata if (mdata.size() != schema_.size()) @@ -54,15 +55,15 @@ void metadata_writer::write(doc_id d_id, uint64_t length, uint64_t num_unique, switch (fld.type) { case corpus::metadata::field_type::SIGNED_INT: - byte_pos_ += io::write_packed_binary(db_file_, fld.sign_int); + byte_pos_ += io::packed::write(db_file_, fld.sign_int); break; case corpus::metadata::field_type::UNSIGNED_INT: - byte_pos_ += io::write_packed_binary(db_file_, fld.usign_int); + byte_pos_ += io::packed::write(db_file_, fld.usign_int); break; case corpus::metadata::field_type::DOUBLE: - byte_pos_ += io::write_packed_binary(db_file_, fld.doub); + byte_pos_ += io::packed::write(db_file_, fld.doub); break; case corpus::metadata::field_type::STRING: From b42d0a5b5e085b8170ded6599c473495403947c4 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Mon, 20 Apr 2015 23:43:51 -0500 Subject: [PATCH 091/481] Only set libcxx options for cmake system tests if present. --- CMakeLists.txt | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 53954a773..bdc5a05a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -109,9 +109,16 @@ if(UNIX) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${LIBDL_LIBRARY}") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${LIBCXX_OPTIONS}") - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${LIBCXX_OPTIONS} ${CXXABI_LIBRARY} -L${LIBCXX_LIB_PATH}") - set(CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES} ${LIBCXX_INCLUDE_DIR}") + if(LIBCXX_OPTIONS) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${LIBCXX_OPTIONS}") + endif() + + if(CXXABI_LIBRARY) + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${LIBCXX_OPTIONS} ${CXXABI_LIBRARY} -L${LIBCXX_LIB_PATH}") + endif() + if(LIBCXX_INCLUDE_DIR) + set(CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES} ${LIBCXX_INCLUDE_DIR}") + endif() endif() add_library(meta-definitions INTERFACE) From b8600b68c3ab62b746b3d97a8ccf736240584526 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 21 Apr 2015 00:06:57 -0500 Subject: [PATCH 092/481] Switch ranker to use a document-at-a-time scheme. This uses the postings_stream objects for memory efficiency during ranking. --- include/index/ranker/ranker.h | 15 ++- src/index/ranker/ranker.cpp | 172 ++++++++++++++++++++++------------ 2 files changed, 119 insertions(+), 68 deletions(-) diff --git a/include/index/ranker/ranker.h b/include/index/ranker/ranker.h index 94d93f553..bde75b6a5 100644 --- a/include/index/ranker/ranker.h +++ b/include/index/ranker/ranker.h @@ -49,11 +49,12 @@ class ranker * if the document should be included in results */ std::vector> - score(inverted_index& idx, corpus::document& query, - uint64_t num_results = 10, - const std::function& filter = [](doc_id) { - return true; - }); + score(inverted_index& idx, corpus::document& query, + uint64_t num_results = 10, + const std::function& filter = [](doc_id) + { + return true; + }); /** * Computes the contribution to the score of a document for a matched @@ -73,10 +74,6 @@ class ranker * Default destructor. 
*/ virtual ~ranker() = default; - - private: - /// results per doc_id - std::vector results_; }; } } diff --git a/src/index/ranker/ranker.cpp b/src/index/ranker/ranker.cpp index 9eb5bdd17..12811fe17 100644 --- a/src/index/ranker/ranker.cpp +++ b/src/index/ranker/ranker.cpp @@ -1,6 +1,7 @@ /** * @file ranker.cpp * @author Sean Massung + * @author Chase Geigle */ #include @@ -15,89 +16,142 @@ namespace meta namespace index { -std::vector> -ranker::score(inverted_index& idx, corpus::document& query, - uint64_t num_results /* = 10 */, - const std::function& filter /* return true */) +namespace +{ + +struct postings_context +{ + using postings_data_type = inverted_index::postings_data_type; + using iterator = postings_stream::iterator; + + postings_stream stream; + iterator begin; + iterator end; + term_id t_id; + uint64_t query_term_count; + uint64_t doc_count; + uint64_t corpus_term_count; + + postings_context(postings_stream strm, uint64_t qtf, term_id term) + : stream{std::move(strm)}, + begin{stream.begin()}, + end{stream.end()}, + t_id{term}, + query_term_count{qtf}, + doc_count{stream.size()}, + corpus_term_count{stream.total_counts()} + { + // nothing + } +}; +} + +std::vector> ranker::score( + inverted_index& idx, corpus::document& query, + uint64_t num_results /* = 10 */, + const std::function& filter /* return true */) { if (query.counts().empty()) idx.tokenize(query); - score_data sd{idx, idx.avg_doc_length(), - idx.num_docs(), idx.total_corpus_terms(), - query}; + score_data sd{idx, idx.avg_doc_length(), idx.num_docs(), + idx.total_corpus_terms(), query}; - // a map from doc_id -> (length, unique_terms) to prevent looking up - // metadata repeatedly in the ranking loop - std::unordered_map> md_map; + std::vector> results; + results.reserve(num_results + 1); // +1 since we use this as a heap and + // prune when it exceeds size num_results + auto comp = [](const std::pair& a, + const std::pair& b) + { + // comparison is reversed since we want a min-heap + return a.second > b.second; + }; - // zeros out elements and (if necessary) resizes the vector; this eliminates - // constructing a new vector each query for the same index - results_.assign(sd.num_docs, std::numeric_limits::lowest()); + std::vector postings; + postings.reserve(query.counts().size()); - for (auto& tpair : query.counts()) + doc_id cur_doc{idx.num_docs()}; + for (const auto& count : query.counts()) { - term_id t_id{idx.get_term_id(tpair.first)}; - auto pdata = idx.search_primary(t_id); - sd.doc_count = pdata->counts().size(); - sd.t_id = t_id; - sd.query_term_count = tpair.second; - sd.corpus_term_count = idx.total_num_occurences(sd.t_id); - for (auto& dpair : pdata->counts()) - { - sd.d_id = dpair.first; - sd.doc_term_count = dpair.second; + auto term = idx.get_term_id(count.first); + auto pstream = idx.stream_for(term); + if (!pstream) + continue; - auto& md = md_map[dpair.first]; - if (md.first == 0) - { - md.first = idx.doc_size(dpair.first); - md.second = idx.unique_terms(dpair.first); - } - sd.doc_size = md.first; - sd.doc_unique_terms = md.second; + postings.emplace_back(*pstream, count.second, term); - // if this is the first time we've seen this document, compute - // its initial score - if (results_[dpair.first] == std::numeric_limits::lowest()) - results_[dpair.first] = initial_score(sd); + while (postings.back().begin != postings.back().end + && !filter(postings.back().begin->first)) + ++postings.back().begin; - results_[dpair.first] += score_one(sd); + if (postings.back().begin != postings.back().end) + { 
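+            // remember the smallest head doc_id seen across the query
+            // terms; document-at-a-time scoring begins at that document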
+ if (postings.back().begin->first < cur_doc) + cur_doc = postings.back().begin->first; } } - using doc_pair = std::pair; - auto doc_pair_comp = [](const doc_pair& a, const doc_pair& b) - { return a.second > b.second; }; - - std::priority_queue, - decltype(doc_pair_comp)> pq{doc_pair_comp}; - for (uint64_t id = 0; id < results_.size(); ++id) + doc_id next_doc{idx.num_docs()}; + while (cur_doc < idx.num_docs()) { - if (!filter(doc_id{id})) - continue; + sd.d_id = cur_doc; + sd.doc_size = idx.doc_size(cur_doc); + sd.doc_unique_terms = idx.unique_terms(cur_doc); - pq.emplace(doc_id{id}, results_[id]); - if (pq.size() > num_results) - pq.pop(); - } + double score = initial_score(sd); + for (auto& pc : postings) + { + if (pc.begin == pc.end || pc.begin->first != cur_doc) + continue; - std::vector sorted; - while (!pq.empty()) - { - sorted.emplace_back(pq.top()); - pq.pop(); + // set up this term + sd.t_id = pc.t_id; + sd.query_term_count = pc.query_term_count; + sd.doc_count = pc.doc_count; + sd.corpus_term_count = pc.corpus_term_count; + sd.doc_term_count = pc.begin->second; + + score += score_one(sd); + + // advance over this position in the current postings context + // until the next valid document + do + { + ++pc.begin; + } while (pc.begin != pc.end && !filter(pc.begin->first)); + + if (pc.begin != pc.end) + { + // check if the document in the next position is the + // smallest accepted doc_id + if (pc.begin->first < next_doc) + next_doc = pc.begin->first; + } + } + + // add doc to the heap and poll if needed + results.emplace_back(cur_doc, score); + std::push_heap(results.begin(), results.end(), comp); + if (results.size() > num_results) + { + std::pop_heap(results.begin(), results.end(), comp); + results.pop_back(); + } + + cur_doc = next_doc; + next_doc = doc_id{idx.num_docs()}; } - std::reverse(sorted.begin(), sorted.end()); - return sorted; + // heap sort the values + for (auto end = results.end(); end != results.begin(); --end) + std::pop_heap(results.begin(), end, comp); + + return results; } double ranker::initial_score(const score_data&) const { return 0.0; } - } } From 499a2a11f97e09c70cc4908a6fe21a906aa56ca0 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 21 Apr 2015 00:07:39 -0500 Subject: [PATCH 093/481] Only request the top k in knn classifiers. It used to request a score for every document in the collection, but only ever used the top k in that list. 
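A minimal standalone sketch of the bounded top-k selection both of these
commits rely on (illustrative names, not code from the tree): keep a
min-heap of at most k (doc, score) pairs and evict the current minimum
whenever the heap grows past k, which yields the top k in O(n log k)
rather than sorting every scored document.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

using doc_score = std::pair<uint64_t, double>;

std::vector<doc_score> select_top_k(const std::vector<doc_score>& scored,
                                    std::size_t k)
{
    // reversed comparison turns std::push_heap into a min-heap on score
    auto comp = [](const doc_score& a, const doc_score& b)
    { return a.second > b.second; };

    std::vector<doc_score> results;
    results.reserve(k + 1);
    for (const auto& ds : scored)
    {
        results.push_back(ds);
        std::push_heap(results.begin(), results.end(), comp);
        if (results.size() > k) // over budget: drop the lowest score
        {
            std::pop_heap(results.begin(), results.end(), comp);
            results.pop_back();
        }
    }
    // repeatedly move the smallest remaining score to the back, leaving
    // the vector sorted by descending score
    for (auto end = results.end(); end != results.begin(); --end)
        std::pop_heap(results.begin(), end, comp);
    return results;
}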
--- src/classify/classifier/knn.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/classify/classifier/knn.cpp b/src/classify/classifier/knn.cpp index 6553e3de5..db926e019 100644 --- a/src/classify/classifier/knn.cpp +++ b/src/classify/classifier/knn.cpp @@ -46,14 +46,13 @@ class_label knn::classify(doc_id d_id) for (const auto& count : idx_->search_primary(d_id)->counts()) query.increment(idx_->term_text(count.first), count.second); - auto scored = ranker_->score(*inv_idx_, query, inv_idx_->num_docs(), - [&](doc_id d_id) + auto scored = ranker_->score(*inv_idx_, query, k_, [&](doc_id d_id) { - return legal_docs_.find(d_id) != legal_docs_.end(); - }); + return legal_docs_.find(d_id) + != legal_docs_.end(); + }); std::unordered_map counts; - uint16_t i = 0; for (auto& s : scored) { // normally, weighted k-nn weights neighbors by 1/distance, but since @@ -63,9 +62,6 @@ class_label knn::classify(doc_id d_id) // if not weighted, each neighbor gets an equal vote else ++counts[idx_->label(s.first)]; - - if (++i > k_) - break; } if (counts.empty()) @@ -75,8 +71,8 @@ class_label knn::classify(doc_id d_id) std::vector sorted{counts.begin(), counts.end()}; std::sort(sorted.begin(), sorted.end(), [](const pair_t& a, const pair_t& b) { - return a.second > b.second; - }); + return a.second > b.second; + }); return select_best_label(scored, sorted); } From 3921f281a209e2ba459652fe1f4c72d9c448cdc2 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 21 Apr 2015 00:08:19 -0500 Subject: [PATCH 094/481] Fix unused variable in forward_index. --- src/index/forward_index.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index 1bd0612d6..d27636137 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -312,7 +312,6 @@ util::optional> void forward_index::impl::uninvert(const inverted_index& inv_idx) { - term_id t_id{0}; chunk_handler handler{idx_->index_name()}; { auto producer = handler.make_producer(); From 1bea38f98151390f2cf38862683c4aed900e072d Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 21 Apr 2015 01:10:47 -0500 Subject: [PATCH 095/481] Fix potential document-skipping bug in new ranker. We weren't checking the head of the postings_context objects for a lower next document id. This would cause a bug if the next lowest document id was one of these and that document id is not present in any postings context that contains the current document id. 
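The shape of the fix, as a minimal sketch (hypothetical cursor type; the
real code walks postings_stream iterators inside postings_context): the
next document to score must be the minimum over the heads of all
non-exhausted cursors, not just the cursors that matched the current
document.

#include <algorithm>
#include <cstdint>
#include <vector>

// hypothetical stand-in for a postings cursor over sorted doc ids
struct cursor
{
    std::vector<uint64_t>::const_iterator head;
    std::vector<uint64_t>::const_iterator end;
};

// pick the smallest doc id at the head of any non-exhausted cursor;
// the buggy version only examined cursors whose head matched cur_doc,
// so a lower head elsewhere could be skipped entirely
uint64_t next_doc(const std::vector<cursor>& cursors, uint64_t sentinel)
{
    uint64_t next = sentinel; // sentinel plays the role of num_docs
    for (const auto& c : cursors)
        if (c.head != c.end)
            next = std::min(next, *c.head);
    return next;
}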
--- src/index/ranker/ranker.cpp | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/index/ranker/ranker.cpp b/src/index/ranker/ranker.cpp index 12811fe17..ba976c3b4 100644 --- a/src/index/ranker/ranker.cpp +++ b/src/index/ranker/ranker.cpp @@ -101,24 +101,27 @@ std::vector> ranker::score( double score = initial_score(sd); for (auto& pc : postings) { - if (pc.begin == pc.end || pc.begin->first != cur_doc) + if (pc.begin == pc.end) continue; - // set up this term - sd.t_id = pc.t_id; - sd.query_term_count = pc.query_term_count; - sd.doc_count = pc.doc_count; - sd.corpus_term_count = pc.corpus_term_count; - sd.doc_term_count = pc.begin->second; - - score += score_one(sd); - - // advance over this position in the current postings context - // until the next valid document - do + if (pc.begin->first == cur_doc) { - ++pc.begin; - } while (pc.begin != pc.end && !filter(pc.begin->first)); + // set up this term + sd.t_id = pc.t_id; + sd.query_term_count = pc.query_term_count; + sd.doc_count = pc.doc_count; + sd.corpus_term_count = pc.corpus_term_count; + sd.doc_term_count = pc.begin->second; + + score += score_one(sd); + + // advance over this position in the current postings context + // until the next valid document + do + { + ++pc.begin; + } while (pc.begin != pc.end && !filter(pc.begin->first)); + } if (pc.begin != pc.end) { From ad9cd02ad7068517ac007d4072fc3b1c48616f9e Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 21 Apr 2015 16:16:09 -0500 Subject: [PATCH 096/481] update query-runner - take actual path to query file - conditionally check for relevance judgements and display them if found --- src/index/tools/query-runner.cpp | 78 ++++++++++++++++++++------------ 1 file changed, 49 insertions(+), 29 deletions(-) diff --git a/src/index/tools/query-runner.cpp b/src/index/tools/query-runner.cpp index 8fa0f892c..2b441d6be 100644 --- a/src/index/tools/query-runner.cpp +++ b/src/index/tools/query-runner.cpp @@ -8,8 +8,10 @@ #include #include "util/time.h" +#include "util/printing.h" #include "corpus/document.h" #include "index/inverted_index.h" +#include "index/eval/ir_eval.h" #include "index/ranker/ranker_factory.h" #include "parser/analyzers/tree_analyzer.h" #include "sequence/analyzers/ngram_pos_analyzer.h" @@ -24,7 +26,7 @@ int main(int argc, char* argv[]) { if (argc != 2) { - std::cerr << "Usage:\t" << argv[0] << " configFile" << std::endl; + std::cerr << "Usage:\t" << argv[0] << " config.toml" << std::endl; return 1; } @@ -49,41 +51,59 @@ int main(int argc, char* argv[]) auto ranker = index::make_ranker(*group); // Get the path to the file containing queries - auto query_path = config.get_as("querypath"); + auto query_path = config.get_as("query-path"); if (!query_path) - throw std::runtime_error{"config file needs a \"querypath\" parameter"}; + throw std::runtime_error{ + "config file needs a \"query-path\" parameter"}; + std::ifstream queries{*query_path}; - std::ifstream queries{*query_path + *config.get_as("dataset") - + "-queries.txt"}; - std::string content; - auto elapsed_seconds = common::time([&]() + std::unique_ptr eval; + try { - size_t i = 1; - while (queries.good() && i <= 500) // only look at first 500 queries - { - std::getline(queries, content); - corpus::document query{doc_id{0}}; - query.content(content); - std::cout << "Ranking query " << i++ << ": " << std::endl; - - // Use the ranker to score the query over the index. 
By default, the - // ranker returns 10 documents, so we will display the "top 10 of - // 10" docs. - auto ranking = ranker->score(*idx, query); - std::cout << "Showing top 10 of " << ranking.size() << " results." - << std::endl; + eval = make_unique(argv[1]); + } + catch (index::ir_eval::ir_eval_exception& ex) + { + LOG(info) << "Could not find relevance judgements; skipping eval" + << ENDLG; + } - for (size_t i = 0; i < ranking.size() && i < 10; ++i) + std::string content; + auto elapsed_seconds = common::time( + [&]() + { + size_t i = 0; + while (std::getline(queries, content)) { - std::cout << (i + 1) << ". " << idx->doc_name(ranking[i].first) - << " " << ranking[i].second << std::endl; + corpus::document query{doc_id{0}}; + query.content(content); + std::cout << "Query " << ++i << ": " << std::endl; + std::cout << std::string(20, '=') << std::endl; + + // Use the ranker to score the query over the index. + auto ranking = ranker->score(*idx, query); + auto result_num = 1; + for (auto& result : ranking) + { + std::cout << result_num << ". " + << idx->doc_name(result.first) << " " + << result.second << std::endl; + if (result_num++ == 10) + break; + } + if (eval) + eval->print_stats(ranking, query_id{i - 1}); + std::cout << std::endl; } - std::cout << std::endl; - } - }); + }); + if (eval) + { + std::cout << printing::make_bold(" MAP: ") << eval->map() << std::endl; + std::cout << printing::make_bold(" gMAP: ") << eval->gmap() + << std::endl; + std::cout << std::endl; + } std::cout << "Elapsed time: " << elapsed_seconds.count() << "ms" << std::endl; - - return 0; } From 8d40d5af36cbf355730d3eeadb731a98f0663397 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 21 Apr 2015 16:58:00 -0500 Subject: [PATCH 097/481] add query-path parameter in default config file --- config.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/config.toml b/config.toml index fee272331..b052ba5ad 100644 --- a/config.toml +++ b/config.toml @@ -6,6 +6,7 @@ punctuation = "../data/sentence-boundaries/sentence-punctuation.txt" start-exceptions = "../data/sentence-boundaries/sentence-start-exceptions.txt" end-exceptions = "../data/sentence-boundaries/sentence-end-exceptions.txt" query-judgements = "../data/ceeaus-qrels.txt" +query-path = "../queries.txt" # create this file corpus-type = "line-corpus" dataset = "20newsgroups" From 72cd69f2adc15a656af1abe74cbc46294de2c2ef Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 21 Apr 2015 19:15:43 -0500 Subject: [PATCH 098/481] add fastapprox header --- include/util/fastapprox.h | 1608 +++++++++++++++++++++++++++++++++++++ 1 file changed, 1608 insertions(+) create mode 100644 include/util/fastapprox.h diff --git a/include/util/fastapprox.h b/include/util/fastapprox.h new file mode 100644 index 000000000..8af382374 --- /dev/null +++ b/include/util/fastapprox.h @@ -0,0 +1,1608 @@ +/*=====================================================================* + * Copyright (C) 2012 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. 
* + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. * + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __CAST_H_ + +#ifdef __cplusplus +#define cast_uint32_t static_cast +#else +#define cast_uint32_t (uint32_t) +#endif + +#endif // __CAST_H_ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. 
* + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __SSE_H_ +#define __SSE_H_ + +#ifdef __SSE2__ + +#include + +#ifdef __cplusplus +namespace { +#endif // __cplusplus + +typedef __m128 v4sf; +typedef __m128i v4si; + +#define v4si_to_v4sf _mm_cvtepi32_ps +#define v4sf_to_v4si _mm_cvttps_epi32 + +#define v4sfl(x) ((const v4sf) { (x), (x), (x), (x) }) +#define v2dil(x) ((const v4si) { (x), (x) }) +#define v4sil(x) v2dil((((unsigned long long) (x)) << 32) | (x)) + +typedef union { v4sf f; float array[4]; } v4sfindexer; +#define v4sf_index(_findx, _findi) \ + ({ \ + v4sfindexer _findvx = { _findx } ; \ + _findvx.array[_findi]; \ + }) +typedef union { v4si i; int array[4]; } v4siindexer; +#define v4si_index(_iindx, _iindi) \ + ({ \ + v4siindexer _iindvx = { _iindx } ; \ + _iindvx.array[_iindi]; \ + }) + +typedef union { v4sf f; v4si i; } v4sfv4sipun; +#define v4sf_fabs(x) \ + ({ \ + v4sfv4sipun vx; \ + vx.f = x; \ + vx.i &= v4sil (0x7FFFFFFF); \ + vx.f; \ + }) + +#ifdef __cplusplus +} // end namespace +#endif // __cplusplus + +#endif // __SSE2__ + +#endif // __SSE_H_ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. * + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __FAST_EXP_H_ +#define __FAST_EXP_H_ + +#include + +// Underflow of exponential is common practice in numerical routines, +// so handle it here. + +static inline float +fastpow2 (float p) +{ + float offset = (p < 0) ? 1.0f : 0.0f; + float clipp = (p < -126) ? 
-126.0f : p; + int w = clipp; + float z = clipp - w + offset; + union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z) ) }; + + return v.f; +} + +static inline float +fastexp (float p) +{ + return fastpow2 (1.442695040f * p); +} + +static inline float +fasterpow2 (float p) +{ + float clipp = (p < -126) ? -126.0f : p; + union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 126.94269504f) ) }; + return v.f; +} + +static inline float +fasterexp (float p) +{ + return fasterpow2 (1.442695040f * p); +} + +#ifdef __SSE2__ + +static inline v4sf +vfastpow2 (const v4sf p) +{ + v4sf ltzero = _mm_cmplt_ps (p, v4sfl (0.0f)); + v4sf offset = _mm_and_ps (ltzero, v4sfl (1.0f)); + v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f)); + v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f))); + v4si w = v4sf_to_v4si (clipp); + v4sf z = clipp - v4si_to_v4sf (w) + offset; + + const v4sf c_121_2740838 = v4sfl (121.2740575f); + const v4sf c_27_7280233 = v4sfl (27.7280233f); + const v4sf c_4_84252568 = v4sfl (4.84252568f); + const v4sf c_1_49012907 = v4sfl (1.49012907f); + union { v4si i; v4sf f; } v = { + v4sf_to_v4si ( + v4sfl (1 << 23) * + (clipp + c_121_2740838 + c_27_7280233 / (c_4_84252568 - z) - c_1_49012907 * z) + ) + }; + + return v.f; +} + +static inline v4sf +vfastexp (const v4sf p) +{ + const v4sf c_invlog_2 = v4sfl (1.442695040f); + + return vfastpow2 (c_invlog_2 * p); +} + +static inline v4sf +vfasterpow2 (const v4sf p) +{ + const v4sf c_126_94269504 = v4sfl (126.94269504f); + v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f)); + v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f))); + union { v4si i; v4sf f; } v = { v4sf_to_v4si (v4sfl (1 << 23) * (clipp + c_126_94269504)) }; + return v.f; +} + +static inline v4sf +vfasterexp (const v4sf p) +{ + const v4sf c_invlog_2 = v4sfl (1.442695040f); + + return vfasterpow2 (c_invlog_2 * p); +} + +#endif //__SSE2__ + +#endif // __FAST_EXP_H_ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. * + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __FAST_LOG_H_ +#define __FAST_LOG_H_ + +#include + +static inline float +fastlog2 (float x) +{ + union { float f; uint32_t i; } vx = { x }; + union { uint32_t i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 }; + float y = vx.i; + y *= 1.1920928955078125e-7f; + + return y - 124.22551499f + - 1.498030302f * mx.f + - 1.72587999f / (0.3520887068f + mx.f); +} + +static inline float +fastlog (float x) +{ + return 0.69314718f * fastlog2 (x); +} + +static inline float +fasterlog2 (float x) +{ + union { float f; uint32_t i; } vx = { x }; + float y = vx.i; + y *= 1.1920928955078125e-7f; + return y - 126.94269504f; +} + +static inline float +fasterlog (float x) +{ +// return 0.69314718f * fasterlog2 (x); + + union { float f; uint32_t i; } vx = { x }; + float y = vx.i; + y *= 8.2629582881927490e-8f; + return y - 87.989971088f; +} + +#ifdef __SSE2__ + +static inline v4sf +vfastlog2 (v4sf x) +{ + union { v4sf f; v4si i; } vx = { x }; + union { v4si i; v4sf f; } mx; mx.i = (vx.i & v4sil (0x007FFFFF)) | v4sil (0x3f000000); + v4sf y = v4si_to_v4sf (vx.i); + y *= v4sfl (1.1920928955078125e-7f); + + const v4sf c_124_22551499 = v4sfl (124.22551499f); + const v4sf c_1_498030302 = v4sfl (1.498030302f); + const v4sf c_1_725877999 = v4sfl (1.72587999f); + const v4sf c_0_3520087068 = v4sfl (0.3520887068f); + + return y - c_124_22551499 + - c_1_498030302 * mx.f + - c_1_725877999 / (c_0_3520087068 + mx.f); +} + +static inline v4sf +vfastlog (v4sf x) +{ + const v4sf c_0_69314718 = v4sfl (0.69314718f); + + return c_0_69314718 * vfastlog2 (x); +} + +static inline v4sf +vfasterlog2 (v4sf x) +{ + union { v4sf f; v4si i; } vx = { x }; + v4sf y = v4si_to_v4sf (vx.i); + y *= v4sfl (1.1920928955078125e-7f); + + const v4sf c_126_94269504 = v4sfl (126.94269504f); + + return y - c_126_94269504; +} + +static inline v4sf +vfasterlog (v4sf x) +{ +// const v4sf c_0_69314718 = v4sfl (0.69314718f); +// +// return c_0_69314718 * vfasterlog2 (x); + + union { v4sf f; v4si i; } vx = { x }; + v4sf y = v4si_to_v4sf (vx.i); + y *= v4sfl (8.2629582881927490e-8f); + + const v4sf c_87_989971088 = v4sfl (87.989971088f); + + return y - c_87_989971088; +} + +#endif // __SSE2__ + +#endif // __FAST_LOG_H_ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. 
* + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. * + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __FAST_ERF_H_ +#define __FAST_ERF_H_ + +#include +#include + +// fasterfc: not actually faster than erfcf(3) on newer machines! +// ... although vectorized version is interesting +// and fastererfc is very fast + +static inline float +fasterfc (float x) +{ + static const float k = 3.3509633149424609f; + static const float a = 0.07219054755431126f; + static const float b = 15.418191568719577f; + static const float c = 5.609846028328545f; + + union { float f; uint32_t i; } vc = { c * x }; + float xsq = x * x; + float xquad = xsq * xsq; + + vc.i |= 0x80000000; + + return 2.0f / (1.0f + fastpow2 (k * x)) - a * x * (b * xquad - 1.0f) * fasterpow2 (vc.f); +} + +static inline float +fastererfc (float x) +{ + static const float k = 3.3509633149424609f; + + return 2.0f / (1.0f + fasterpow2 (k * x)); +} + +// fasterf: not actually faster than erff(3) on newer machines! +// ... 
although vectorized version is interesting +// and fastererf is very fast + +static inline float +fasterf (float x) +{ + return 1.0f - fasterfc (x); +} + +static inline float +fastererf (float x) +{ + return 1.0f - fastererfc (x); +} + +static inline float +fastinverseerf (float x) +{ + static const float invk = 0.30004578719350504f; + static const float a = 0.020287853348211326f; + static const float b = 0.07236892874789555f; + static const float c = 0.9913030456864257f; + static const float d = 0.8059775923760193f; + + float xsq = x * x; + + return invk * fastlog2 ((1.0f + x) / (1.0f - x)) + + x * (a - b * xsq) / (c - d * xsq); +} + +static inline float +fasterinverseerf (float x) +{ + static const float invk = 0.30004578719350504f; + + return invk * fasterlog2 ((1.0f + x) / (1.0f - x)); +} + +#ifdef __SSE2__ + +static inline v4sf +vfasterfc (v4sf x) +{ + const v4sf k = v4sfl (3.3509633149424609f); + const v4sf a = v4sfl (0.07219054755431126f); + const v4sf b = v4sfl (15.418191568719577f); + const v4sf c = v4sfl (5.609846028328545f); + + union { v4sf f; v4si i; } vc; vc.f = c * x; + vc.i |= v4sil (0x80000000); + + v4sf xsq = x * x; + v4sf xquad = xsq * xsq; + + return v4sfl (2.0f) / (v4sfl (1.0f) + vfastpow2 (k * x)) - a * x * (b * xquad - v4sfl (1.0f)) * vfasterpow2 (vc.f); +} + +static inline v4sf +vfastererfc (const v4sf x) +{ + const v4sf k = v4sfl (3.3509633149424609f); + + return v4sfl (2.0f) / (v4sfl (1.0f) + vfasterpow2 (k * x)); +} + +static inline v4sf +vfasterf (v4sf x) +{ + return v4sfl (1.0f) - vfasterfc (x); +} + +static inline v4sf +vfastererf (const v4sf x) +{ + return v4sfl (1.0f) - vfastererfc (x); +} + +static inline v4sf +vfastinverseerf (v4sf x) +{ + const v4sf invk = v4sfl (0.30004578719350504f); + const v4sf a = v4sfl (0.020287853348211326f); + const v4sf b = v4sfl (0.07236892874789555f); + const v4sf c = v4sfl (0.9913030456864257f); + const v4sf d = v4sfl (0.8059775923760193f); + + v4sf xsq = x * x; + + return invk * vfastlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x)) + + x * (a - b * xsq) / (c - d * xsq); +} + +static inline v4sf +vfasterinverseerf (v4sf x) +{ + const v4sf invk = v4sfl (0.30004578719350504f); + + return invk * vfasterlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x)); +} + +#endif //__SSE2__ + +#endif // __FAST_ERF_H_ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. * + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __FAST_GAMMA_H_ +#define __FAST_GAMMA_H_ + +#include + +/* gamma/digamma functions only work for positive inputs */ + +static inline float +fastlgamma (float x) +{ + float logterm = fastlog (x * (1.0f + x) * (2.0f + x)); + float xp3 = 3.0f + x; + + return - 2.081061466f + - x + + 0.0833333f / xp3 + - logterm + + (2.5f + x) * fastlog (xp3); +} + +static inline float +fasterlgamma (float x) +{ + return - 0.0810614667f + - x + - fasterlog (x) + + (0.5f + x) * fasterlog (1.0f + x); +} + +static inline float +fastdigamma (float x) +{ + float twopx = 2.0f + x; + float logterm = fastlog (twopx); + + return (-48.0f + x * (-157.0f + x * (-127.0f - 30.0f * x))) / + (12.0f * x * (1.0f + x) * twopx * twopx) + + logterm; +} + +static inline float +fasterdigamma (float x) +{ + float onepx = 1.0f + x; + + return -1.0f / x - 1.0f / (2 * onepx) + fasterlog (onepx); +} + +#ifdef __SSE2__ + +static inline v4sf +vfastlgamma (v4sf x) +{ + const v4sf c_1_0 = v4sfl (1.0f); + const v4sf c_2_0 = v4sfl (2.0f); + const v4sf c_3_0 = v4sfl (3.0f); + const v4sf c_2_081061466 = v4sfl (2.081061466f); + const v4sf c_0_0833333 = v4sfl (0.0833333f); + const v4sf c_2_5 = v4sfl (2.5f); + + v4sf logterm = vfastlog (x * (c_1_0 + x) * (c_2_0 + x)); + v4sf xp3 = c_3_0 + x; + + return - c_2_081061466 + - x + + c_0_0833333 / xp3 + - logterm + + (c_2_5 + x) * vfastlog (xp3); +} + +static inline v4sf +vfasterlgamma (v4sf x) +{ + const v4sf c_0_0810614667 = v4sfl (0.0810614667f); + const v4sf c_0_5 = v4sfl (0.5f); + const v4sf c_1 = v4sfl (1.0f); + + return - c_0_0810614667 + - x + - vfasterlog (x) + + (c_0_5 + x) * vfasterlog (c_1 + x); +} + +static inline v4sf +vfastdigamma (v4sf x) +{ + v4sf twopx = v4sfl (2.0f) + x; + v4sf logterm = vfastlog (twopx); + + return (v4sfl (-48.0f) + x * (v4sfl (-157.0f) + x * (v4sfl (-127.0f) - v4sfl (30.0f) * x))) / + (v4sfl (12.0f) * x * (v4sfl (1.0f) + x) * twopx * twopx) + + logterm; +} + +static inline v4sf +vfasterdigamma (v4sf x) +{ + const v4sf c_1_0 = v4sfl (1.0f); + const v4sf c_2_0 = v4sfl (2.0f); + v4sf onepx = c_1_0 + x; + + return -c_1_0 / x - c_1_0 / (c_2_0 * onepx) + vfasterlog (onepx); +} + +#endif //__SSE2__ + +#endif // __FAST_GAMMA_H_ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. 
* + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. * + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __FAST_HYPERBOLIC_H_ +#define __FAST_HYPERBOLIC_H_ + +#include + +static inline float +fastsinh (float p) +{ + return 0.5f * (fastexp (p) - fastexp (-p)); +} + +static inline float +fastersinh (float p) +{ + return 0.5f * (fasterexp (p) - fasterexp (-p)); +} + +static inline float +fastcosh (float p) +{ + return 0.5f * (fastexp (p) + fastexp (-p)); +} + +static inline float +fastercosh (float p) +{ + return 0.5f * (fasterexp (p) + fasterexp (-p)); +} + +static inline float +fasttanh (float p) +{ + return -1.0f + 2.0f / (1.0f + fastexp (-2.0f * p)); +} + +static inline float +fastertanh (float p) +{ + return -1.0f + 2.0f / (1.0f + fasterexp (-2.0f * p)); +} + +#ifdef __SSE2__ + +static inline v4sf +vfastsinh (const v4sf p) +{ + const v4sf c_0_5 = v4sfl (0.5f); + + return c_0_5 * (vfastexp (p) - vfastexp (-p)); +} + +static inline v4sf +vfastersinh (const v4sf p) +{ + const v4sf c_0_5 = v4sfl (0.5f); + + return c_0_5 * (vfasterexp (p) - vfasterexp (-p)); +} + +static inline v4sf +vfastcosh (const v4sf p) +{ + const v4sf c_0_5 = v4sfl (0.5f); + + return c_0_5 * (vfastexp (p) + vfastexp (-p)); +} + +static inline v4sf +vfastercosh (const v4sf p) +{ + const v4sf c_0_5 = v4sfl (0.5f); + + return c_0_5 * (vfasterexp (p) + vfasterexp (-p)); +} + +static inline v4sf +vfasttanh (const v4sf p) +{ + const v4sf c_1 = v4sfl (1.0f); + const v4sf c_2 = v4sfl (2.0f); + + return -c_1 + c_2 / (c_1 + vfastexp (-c_2 * p)); +} + +static inline v4sf +vfastertanh (const v4sf p) +{ + const v4sf c_1 = v4sfl (1.0f); + const v4sf c_2 = v4sfl (2.0f); + + return -c_1 + c_2 / (c_1 + vfasterexp (-c_2 * p)); +} + +#endif //__SSE2__ + +#endif // __FAST_HYPERBOLIC_H_ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. 
* + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. * + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __FAST_LAMBERT_W_H_ +#define __FAST_LAMBERT_W_H_ + +#include + +// these functions compute the upper branch aka W_0 + +static inline float +fastlambertw (float x) +{ + static const float threshold = 2.26445f; + + float c = (x < threshold) ? 1.546865557f : 1.0f; + float d = (x < threshold) ? 2.250366841f : 0.0f; + float a = (x < threshold) ? -0.737769969f : 0.0f; + + float logterm = fastlog (c * x + d); + float loglogterm = fastlog (logterm); + + float minusw = -a - logterm + loglogterm - loglogterm / logterm; + float expminusw = fastexp (minusw); + float xexpminusw = x * expminusw; + float pexpminusw = xexpminusw - minusw; + + return (2.0f * xexpminusw - minusw * (4.0f * xexpminusw - minusw * pexpminusw)) / + (2.0f + pexpminusw * (2.0f - minusw)); +} + +static inline float +fasterlambertw (float x) +{ + static const float threshold = 2.26445f; + + float c = (x < threshold) ? 1.546865557f : 1.0f; + float d = (x < threshold) ? 2.250366841f : 0.0f; + float a = (x < threshold) ? -0.737769969f : 0.0f; + + float logterm = fasterlog (c * x + d); + float loglogterm = fasterlog (logterm); + + float w = a + logterm - loglogterm + loglogterm / logterm; + float expw = fasterexp (-w); + + return (w * w + expw * x) / (1.0f + w); +} + +static inline float +fastlambertwexpx (float x) +{ + static const float k = 1.1765631309f; + static const float a = 0.94537622168f; + + float logarg = fmaxf (x, k); + float powarg = (x < k) ? a * (x - k) : 0; + + float logterm = fastlog (logarg); + float powterm = fasterpow2 (powarg); // don't need accuracy here + + float w = powterm * (logarg - logterm + logterm / logarg); + float logw = fastlog (w); + float p = x - logw; + + return w * (2.0f + p + w * (3.0f + 2.0f * p)) / + (2.0f - p + w * (5.0f + 2.0f * w)); +} + +static inline float +fasterlambertwexpx (float x) +{ + static const float k = 1.1765631309f; + static const float a = 0.94537622168f; + + float logarg = fmaxf (x, k); + float powarg = (x < k) ? 
a * (x - k) : 0; + + float logterm = fasterlog (logarg); + float powterm = fasterpow2 (powarg); + + float w = powterm * (logarg - logterm + logterm / logarg); + float logw = fasterlog (w); + + return w * (1.0f + x - logw) / (1.0f + w); +} + +#ifdef __SSE2__ + +static inline v4sf +vfastlambertw (v4sf x) +{ + const v4sf threshold = v4sfl (2.26445f); + + v4sf under = _mm_cmplt_ps (x, threshold); + v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)), + _mm_andnot_ps (under, v4sfl (1.0f))); + v4sf d = _mm_and_ps (under, v4sfl (2.250366841f)); + v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f)); + + v4sf logterm = vfastlog (c * x + d); + v4sf loglogterm = vfastlog (logterm); + + v4sf minusw = -a - logterm + loglogterm - loglogterm / logterm; + v4sf expminusw = vfastexp (minusw); + v4sf xexpminusw = x * expminusw; + v4sf pexpminusw = xexpminusw - minusw; + + return (v4sfl (2.0f) * xexpminusw - minusw * (v4sfl (4.0f) * xexpminusw - minusw * pexpminusw)) / + (v4sfl (2.0f) + pexpminusw * (v4sfl (2.0f) - minusw)); +} + +static inline v4sf +vfasterlambertw (v4sf x) +{ + const v4sf threshold = v4sfl (2.26445f); + + v4sf under = _mm_cmplt_ps (x, threshold); + v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)), + _mm_andnot_ps (under, v4sfl (1.0f))); + v4sf d = _mm_and_ps (under, v4sfl (2.250366841f)); + v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f)); + + v4sf logterm = vfasterlog (c * x + d); + v4sf loglogterm = vfasterlog (logterm); + + v4sf w = a + logterm - loglogterm + loglogterm / logterm; + v4sf expw = vfasterexp (-w); + + return (w * w + expw * x) / (v4sfl (1.0f) + w); +} + +static inline v4sf +vfastlambertwexpx (v4sf x) +{ + const v4sf k = v4sfl (1.1765631309f); + const v4sf a = v4sfl (0.94537622168f); + const v4sf two = v4sfl (2.0f); + const v4sf three = v4sfl (3.0f); + const v4sf five = v4sfl (5.0f); + + v4sf logarg = _mm_max_ps (x, k); + v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k)); + + v4sf logterm = vfastlog (logarg); + v4sf powterm = vfasterpow2 (powarg); // don't need accuracy here + + v4sf w = powterm * (logarg - logterm + logterm / logarg); + v4sf logw = vfastlog (w); + v4sf p = x - logw; + + return w * (two + p + w * (three + two * p)) / + (two - p + w * (five + two * w)); +} + +static inline v4sf +vfasterlambertwexpx (v4sf x) +{ + const v4sf k = v4sfl (1.1765631309f); + const v4sf a = v4sfl (0.94537622168f); + + v4sf logarg = _mm_max_ps (x, k); + v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k)); + + v4sf logterm = vfasterlog (logarg); + v4sf powterm = vfasterpow2 (powarg); + + v4sf w = powterm * (logarg - logterm + logterm / logarg); + v4sf logw = vfasterlog (w); + + return w * (v4sfl (1.0f) + x - logw) / (v4sfl (1.0f) + w); +} + +#endif // __SSE2__ + +#endif // __FAST_LAMBERT_W_H_ + +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. 
* + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. * + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __FAST_POW_H_ +#define __FAST_POW_H_ + +#include + +static inline float +fastpow (float x, + float p) +{ + return fastpow2 (p * fastlog2 (x)); +} + +static inline float +fasterpow (float x, + float p) +{ + return fasterpow2 (p * fasterlog2 (x)); +} + +#ifdef __SSE2__ + +static inline v4sf +vfastpow (const v4sf x, + const v4sf p) +{ + return vfastpow2 (p * vfastlog2 (x)); +} + +static inline v4sf +vfasterpow (const v4sf x, + const v4sf p) +{ + return vfasterpow2 (p * vfasterlog2 (x)); +} + +#endif //__SSE2__ + +#endif // __FAST_POW_H_ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. 
* + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __FAST_SIGMOID_H_ +#define __FAST_SIGMOID_H_ + +#include + +static inline float +fastsigmoid (float x) +{ + return 1.0f / (1.0f + fastexp (-x)); +} + +static inline float +fastersigmoid (float x) +{ + return 1.0f / (1.0f + fasterexp (-x)); +} + +#ifdef __SSE2__ + +static inline v4sf +vfastsigmoid (const v4sf x) +{ + const v4sf c_1 = v4sfl (1.0f); + + return c_1 / (c_1 + vfastexp (-x)); +} + +static inline v4sf +vfastersigmoid (const v4sf x) +{ + const v4sf c_1 = v4sfl (1.0f); + + return c_1 / (c_1 + vfasterexp (-x)); +} + +#endif //__SSE2__ + +#endif // __FAST_SIGMOID_H_ +/*=====================================================================* + * Copyright (C) 2011 Paul Mineiro * + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with * + * or without modification, are permitted provided that the * + * following conditions are met: * + * * + * * Redistributions of source code must retain the * + * above copyright notice, this list of conditions and * + * the following disclaimer. * + * * + * * Redistributions in binary form must reproduce the * + * above copyright notice, this list of conditions and * + * the following disclaimer in the documentation and/or * + * other materials provided with the distribution. * + * * + * * Neither the name of Paul Mineiro nor the names * + * of other contributors may be used to endorse or promote * + * products derived from this software without specific * + * prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * + * POSSIBILITY OF SUCH DAMAGE. * + * * + * Contact: Paul Mineiro * + *=====================================================================*/ + +#ifndef __FAST_TRIG_H_ +#define __FAST_TRIG_H_ + +#include + +// http://www.devmaster.net/forums/showthread.php?t=5784 +// fast sine variants are for x \in [ -\pi, pi ] +// fast cosine variants are for x \in [ -\pi, pi ] +// fast tangent variants are for x \in [ -\pi / 2, pi / 2 ] +// "full" versions of functions handle the entire range of inputs +// although the range reduction technique used here will be hopelessly +// inaccurate for |x| >> 1000 +// +// WARNING: fastsinfull, fastcosfull, and fasttanfull can be slower than +// libc calls on older machines (!) and on newer machines are only +// slighly faster. 
however: +// * vectorized versions are competitive +// * faster full versions are competitive + +static inline float +fastsin (float x) +{ + static const float fouroverpi = 1.2732395447351627f; + static const float fouroverpisq = 0.40528473456935109f; + static const float q = 0.78444488374548933f; + union { float f; uint32_t i; } p = { 0.20363937680730309f }; + union { float f; uint32_t i; } r = { 0.015124940802184233f }; + union { float f; uint32_t i; } s = { -0.0032225901625579573f }; + + union { float f; uint32_t i; } vx = { x }; + uint32_t sign = vx.i & 0x80000000; + vx.i = vx.i & 0x7FFFFFFF; + + float qpprox = fouroverpi * x - fouroverpisq * x * vx.f; + float qpproxsq = qpprox * qpprox; + + p.i |= sign; + r.i |= sign; + s.i ^= sign; + + return q * qpprox + qpproxsq * (p.f + qpproxsq * (r.f + qpproxsq * s.f)); +} + +static inline float +fastersin (float x) +{ + static const float fouroverpi = 1.2732395447351627f; + static const float fouroverpisq = 0.40528473456935109f; + static const float q = 0.77633023248007499f; + union { float f; uint32_t i; } p = { 0.22308510060189463f }; + + union { float f; uint32_t i; } vx = { x }; + uint32_t sign = vx.i & 0x80000000; + vx.i &= 0x7FFFFFFF; + + float qpprox = fouroverpi * x - fouroverpisq * x * vx.f; + + p.i |= sign; + + return qpprox * (q + p.f * qpprox); +} + +static inline float +fastsinfull (float x) +{ + static const float twopi = 6.2831853071795865f; + static const float invtwopi = 0.15915494309189534f; + + int k = x * invtwopi; + float half = (x < 0) ? -0.5f : 0.5f; + return fastsin ((half + k) * twopi - x); +} + +static inline float +fastersinfull (float x) +{ + static const float twopi = 6.2831853071795865f; + static const float invtwopi = 0.15915494309189534f; + + int k = x * invtwopi; + float half = (x < 0) ? -0.5f : 0.5f; + return fastersin ((half + k) * twopi - x); +} + +static inline float +fastcos (float x) +{ + static const float halfpi = 1.5707963267948966f; + static const float halfpiminustwopi = -4.7123889803846899f; + float offset = (x > halfpi) ? halfpiminustwopi : halfpi; + return fastsin (x + offset); +} + +static inline float +fastercos (float x) +{ + static const float twooverpi = 0.63661977236758134f; + static const float p = 0.54641335845679634f; + + union { float f; uint32_t i; } vx = { x }; + vx.i &= 0x7FFFFFFF; + + float qpprox = 1.0f - twooverpi * vx.f; + + return qpprox + p * qpprox * (1.0f - qpprox * qpprox); +} + +static inline float +fastcosfull (float x) +{ + static const float halfpi = 1.5707963267948966f; + return fastsinfull (x + halfpi); +} + +static inline float +fastercosfull (float x) +{ + static const float halfpi = 1.5707963267948966f; + return fastersinfull (x + halfpi); +} + +static inline float +fasttan (float x) +{ + static const float halfpi = 1.5707963267948966f; + return fastsin (x) / fastsin (x + halfpi); +} + +static inline float +fastertan (float x) +{ + return fastersin (x) / fastercos (x); +} + +static inline float +fasttanfull (float x) +{ + static const float twopi = 6.2831853071795865f; + static const float invtwopi = 0.15915494309189534f; + + int k = x * invtwopi; + float half = (x < 0) ? -0.5f : 0.5f; + float xnew = x - (half + k) * twopi; + + return fastsin (xnew) / fastcos (xnew); +} + +static inline float +fastertanfull (float x) +{ + static const float twopi = 6.2831853071795865f; + static const float invtwopi = 0.15915494309189534f; + + int k = x * invtwopi; + float half = (x < 0) ? 
-0.5f : 0.5f; + float xnew = x - (half + k) * twopi; + + return fastersin (xnew) / fastercos (xnew); +} + +#ifdef __SSE2__ + +static inline v4sf +vfastsin (const v4sf x) +{ + const v4sf fouroverpi = v4sfl (1.2732395447351627f); + const v4sf fouroverpisq = v4sfl (0.40528473456935109f); + const v4sf q = v4sfl (0.78444488374548933f); + const v4sf p = v4sfl (0.20363937680730309f); + const v4sf r = v4sfl (0.015124940802184233f); + const v4sf s = v4sfl (-0.0032225901625579573f); + + union { v4sf f; v4si i; } vx = { x }; + v4si sign = vx.i & v4sil (0x80000000); + vx.i &= v4sil (0x7FFFFFFF); + + v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f; + v4sf qpproxsq = qpprox * qpprox; + union { v4sf f; v4si i; } vy; vy.f = qpproxsq * (p + qpproxsq * (r + qpproxsq * s)); + vy.i ^= sign; + + return q * qpprox + vy.f; +} + +static inline v4sf +vfastersin (const v4sf x) +{ + const v4sf fouroverpi = v4sfl (1.2732395447351627f); + const v4sf fouroverpisq = v4sfl (0.40528473456935109f); + const v4sf q = v4sfl (0.77633023248007499f); + const v4sf plit = v4sfl (0.22308510060189463f); + union { v4sf f; v4si i; } p = { plit }; + + union { v4sf f; v4si i; } vx = { x }; + v4si sign = vx.i & v4sil (0x80000000); + vx.i &= v4sil (0x7FFFFFFF); + + v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f; + + p.i |= sign; + + return qpprox * (q + p.f * qpprox); +} + +static inline v4sf +vfastsinfull (const v4sf x) +{ + const v4sf twopi = v4sfl (6.2831853071795865f); + const v4sf invtwopi = v4sfl (0.15915494309189534f); + + v4si k = v4sf_to_v4si (x * invtwopi); + + v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); + v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), + _mm_andnot_ps (ltzero, v4sfl (0.5f))); + + return vfastsin ((half + v4si_to_v4sf (k)) * twopi - x); +} + +static inline v4sf +vfastersinfull (const v4sf x) +{ + const v4sf twopi = v4sfl (6.2831853071795865f); + const v4sf invtwopi = v4sfl (0.15915494309189534f); + + v4si k = v4sf_to_v4si (x * invtwopi); + + v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); + v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), + _mm_andnot_ps (ltzero, v4sfl (0.5f))); + + return vfastersin ((half + v4si_to_v4sf (k)) * twopi - x); +} + +static inline v4sf +vfastcos (const v4sf x) +{ + const v4sf halfpi = v4sfl (1.5707963267948966f); + const v4sf halfpiminustwopi = v4sfl (-4.7123889803846899f); + v4sf lthalfpi = _mm_cmpnlt_ps (x, halfpi); + v4sf offset = _mm_or_ps (_mm_and_ps (lthalfpi, halfpiminustwopi), + _mm_andnot_ps (lthalfpi, halfpi)); + return vfastsin (x + offset); +} + +static inline v4sf +vfastercos (v4sf x) +{ + const v4sf twooverpi = v4sfl (0.63661977236758134f); + const v4sf p = v4sfl (0.54641335845679634); + + v4sf vx = v4sf_fabs (x); + v4sf qpprox = v4sfl (1.0f) - twooverpi * vx; + + return qpprox + p * qpprox * (v4sfl (1.0f) - qpprox * qpprox); +} + +static inline v4sf +vfastcosfull (const v4sf x) +{ + const v4sf halfpi = v4sfl (1.5707963267948966f); + return vfastsinfull (x + halfpi); +} + +static inline v4sf +vfastercosfull (const v4sf x) +{ + const v4sf halfpi = v4sfl (1.5707963267948966f); + return vfastersinfull (x + halfpi); +} + +static inline v4sf +vfasttan (const v4sf x) +{ + const v4sf halfpi = v4sfl (1.5707963267948966f); + return vfastsin (x) / vfastsin (x + halfpi); +} + +static inline v4sf +vfastertan (const v4sf x) +{ + return vfastersin (x) / vfastercos (x); +} + +static inline v4sf +vfasttanfull (const v4sf x) +{ + const v4sf twopi = v4sfl (6.2831853071795865f); + const v4sf invtwopi = v4sfl (0.15915494309189534f); + + v4si k = 
v4sf_to_v4si (x * invtwopi); + + v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); + v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), + _mm_andnot_ps (ltzero, v4sfl (0.5f))); + v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi; + + return vfastsin (xnew) / vfastcos (xnew); +} + +static inline v4sf +vfastertanfull (const v4sf x) +{ + const v4sf twopi = v4sfl (6.2831853071795865f); + const v4sf invtwopi = v4sfl (0.15915494309189534f); + + v4si k = v4sf_to_v4si (x * invtwopi); + + v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); + v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), + _mm_andnot_ps (ltzero, v4sfl (0.5f))); + v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi; + + return vfastersin (xnew) / vfastercos (xnew); +} + +#endif //__SSE2__ + +#endif // __FAST_TRIG_H_ From 6045cdce38cda254c2ac21a797a466767e7fe847 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 21 Apr 2015 19:56:34 -0500 Subject: [PATCH 099/481] remove most of code from fastapprox; causes too many warnings and errors --- include/util/fastapprox.h | 1598 ++----------------------------------- 1 file changed, 64 insertions(+), 1534 deletions(-) diff --git a/include/util/fastapprox.h b/include/util/fastapprox.h index 8af382374..4011b362e 100644 --- a/include/util/fastapprox.h +++ b/include/util/fastapprox.h @@ -38,1571 +38,101 @@ * Contact: Paul Mineiro * *=====================================================================*/ -#ifndef __CAST_H_ - -#ifdef __cplusplus -#define cast_uint32_t static_cast -#else -#define cast_uint32_t (uint32_t) -#endif - -#endif // __CAST_H_ -/*=====================================================================* - * Copyright (C) 2011 Paul Mineiro * - * All rights reserved. * - * * - * Redistribution and use in source and binary forms, with * - * or without modification, are permitted provided that the * - * following conditions are met: * - * * - * * Redistributions of source code must retain the * - * above copyright notice, this list of conditions and * - * the following disclaimer. * - * * - * * Redistributions in binary form must reproduce the * - * above copyright notice, this list of conditions and * - * the following disclaimer in the documentation and/or * - * other materials provided with the distribution. * - * * - * * Neither the name of Paul Mineiro nor the names * - * of other contributors may be used to endorse or promote * - * products derived from this software without specific * - * prior written permission. * - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * - * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * - * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * - * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * - * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * - * POSSIBILITY OF SUCH DAMAGE. 
* - * * - * Contact: Paul Mineiro * - *=====================================================================*/ - -#ifndef __SSE_H_ -#define __SSE_H_ - -#ifdef __SSE2__ - -#include - -#ifdef __cplusplus -namespace { -#endif // __cplusplus - -typedef __m128 v4sf; -typedef __m128i v4si; - -#define v4si_to_v4sf _mm_cvtepi32_ps -#define v4sf_to_v4si _mm_cvttps_epi32 - -#define v4sfl(x) ((const v4sf) { (x), (x), (x), (x) }) -#define v2dil(x) ((const v4si) { (x), (x) }) -#define v4sil(x) v2dil((((unsigned long long) (x)) << 32) | (x)) - -typedef union { v4sf f; float array[4]; } v4sfindexer; -#define v4sf_index(_findx, _findi) \ - ({ \ - v4sfindexer _findvx = { _findx } ; \ - _findvx.array[_findi]; \ - }) -typedef union { v4si i; int array[4]; } v4siindexer; -#define v4si_index(_iindx, _iindi) \ - ({ \ - v4siindexer _iindvx = { _iindx } ; \ - _iindvx.array[_iindi]; \ - }) - -typedef union { v4sf f; v4si i; } v4sfv4sipun; -#define v4sf_fabs(x) \ - ({ \ - v4sfv4sipun vx; \ - vx.f = x; \ - vx.i &= v4sil (0x7FFFFFFF); \ - vx.f; \ - }) - -#ifdef __cplusplus -} // end namespace -#endif // __cplusplus - -#endif // __SSE2__ - -#endif // __SSE_H_ -/*=====================================================================* - * Copyright (C) 2011 Paul Mineiro * - * All rights reserved. * - * * - * Redistribution and use in source and binary forms, with * - * or without modification, are permitted provided that the * - * following conditions are met: * - * * - * * Redistributions of source code must retain the * - * above copyright notice, this list of conditions and * - * the following disclaimer. * - * * - * * Redistributions in binary form must reproduce the * - * above copyright notice, this list of conditions and * - * the following disclaimer in the documentation and/or * - * other materials provided with the distribution. * - * * - * * Neither the name of Paul Mineiro nor the names * - * of other contributors may be used to endorse or promote * - * products derived from this software without specific * - * prior written permission. * - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * - * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * - * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * - * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * - * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * - * POSSIBILITY OF SUCH DAMAGE. * - * * - * Contact: Paul Mineiro * - *=====================================================================*/ - -#ifndef __FAST_EXP_H_ -#define __FAST_EXP_H_ - +#ifndef FASTAPPROX_H_ +#define FASTAPPROX_H_ #include // Underflow of exponential is common practice in numerical routines, // so handle it here. -static inline float -fastpow2 (float p) -{ - float offset = (p < 0) ? 1.0f : 0.0f; - float clipp = (p < -126) ? 
-126.0f : p; - int w = clipp; - float z = clipp - w + offset; - union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z) ) }; - - return v.f; -} - -static inline float -fastexp (float p) -{ - return fastpow2 (1.442695040f * p); -} - -static inline float -fasterpow2 (float p) +namespace fastapprox { - float clipp = (p < -126) ? -126.0f : p; - union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 126.94269504f) ) }; - return v.f; -} - -static inline float -fasterexp (float p) -{ - return fasterpow2 (1.442695040f * p); -} -#ifdef __SSE2__ - -static inline v4sf -vfastpow2 (const v4sf p) +static inline float fastpow2(float p) { - v4sf ltzero = _mm_cmplt_ps (p, v4sfl (0.0f)); - v4sf offset = _mm_and_ps (ltzero, v4sfl (1.0f)); - v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f)); - v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f))); - v4si w = v4sf_to_v4si (clipp); - v4sf z = clipp - v4si_to_v4sf (w) + offset; - - const v4sf c_121_2740838 = v4sfl (121.2740575f); - const v4sf c_27_7280233 = v4sfl (27.7280233f); - const v4sf c_4_84252568 = v4sfl (4.84252568f); - const v4sf c_1_49012907 = v4sfl (1.49012907f); - union { v4si i; v4sf f; } v = { - v4sf_to_v4si ( - v4sfl (1 << 23) * - (clipp + c_121_2740838 + c_27_7280233 / (c_4_84252568 - z) - c_1_49012907 * z) - ) - }; + float offset = (p < 0) ? 1.0f : 0.0f; + float clipp = (p < -126) ? -126.0f : p; + int w = clipp; + float z = clipp - w + offset; + union + { + uint32_t i; + float f; + } v = {static_cast((1 << 23) * (clipp + 121.2740575f + + 27.7280233f / (4.84252568f - z) + - 1.49012907f * z))}; - return v.f; + return v.f; } -static inline v4sf -vfastexp (const v4sf p) +static inline float fastexp(float p) { - const v4sf c_invlog_2 = v4sfl (1.442695040f); - - return vfastpow2 (c_invlog_2 * p); + return fastpow2(1.442695040f * p); } -static inline v4sf -vfasterpow2 (const v4sf p) +static inline float fasterpow2(float p) { - const v4sf c_126_94269504 = v4sfl (126.94269504f); - v4sf lt126 = _mm_cmplt_ps (p, v4sfl (-126.0f)); - v4sf clipp = _mm_or_ps (_mm_andnot_ps (lt126, p), _mm_and_ps (lt126, v4sfl (-126.0f))); - union { v4si i; v4sf f; } v = { v4sf_to_v4si (v4sfl (1 << 23) * (clipp + c_126_94269504)) }; - return v.f; + float clipp = (p < -126) ? -126.0f : p; + union + { + uint32_t i; + float f; + } v = {static_cast((1 << 23) * (clipp + 126.94269504f))}; + return v.f; } -static inline v4sf -vfasterexp (const v4sf p) +static inline float fasterexp(float p) { - const v4sf c_invlog_2 = v4sfl (1.442695040f); - - return vfasterpow2 (c_invlog_2 * p); + return fasterpow2(1.442695040f * p); } -#endif //__SSE2__ - -#endif // __FAST_EXP_H_ -/*=====================================================================* - * Copyright (C) 2011 Paul Mineiro * - * All rights reserved. * - * * - * Redistribution and use in source and binary forms, with * - * or without modification, are permitted provided that the * - * following conditions are met: * - * * - * * Redistributions of source code must retain the * - * above copyright notice, this list of conditions and * - * the following disclaimer. * - * * - * * Redistributions in binary form must reproduce the * - * above copyright notice, this list of conditions and * - * the following disclaimer in the documentation and/or * - * other materials provided with the distribution. 
* - * * - * * Neither the name of Paul Mineiro nor the names * - * of other contributors may be used to endorse or promote * - * products derived from this software without specific * - * prior written permission. * - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * - * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * - * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * - * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * - * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * - * POSSIBILITY OF SUCH DAMAGE. * - * * - * Contact: Paul Mineiro * - *=====================================================================*/ - -#ifndef __FAST_LOG_H_ -#define __FAST_LOG_H_ - -#include - -static inline float -fastlog2 (float x) +static inline float fastlog2(float x) { - union { float f; uint32_t i; } vx = { x }; - union { uint32_t i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 }; - float y = vx.i; - y *= 1.1920928955078125e-7f; + union + { + float f; + uint32_t i; + } vx = {x}; + union + { + uint32_t i; + float f; + } mx = {(vx.i & 0x007FFFFF) | 0x3f000000}; + float y = vx.i; + y *= 1.1920928955078125e-7f; - return y - 124.22551499f - - 1.498030302f * mx.f + return y - 124.22551499f - 1.498030302f * mx.f - 1.72587999f / (0.3520887068f + mx.f); } -static inline float -fastlog (float x) -{ - return 0.69314718f * fastlog2 (x); -} - -static inline float -fasterlog2 (float x) -{ - union { float f; uint32_t i; } vx = { x }; - float y = vx.i; - y *= 1.1920928955078125e-7f; - return y - 126.94269504f; -} - -static inline float -fasterlog (float x) -{ -// return 0.69314718f * fasterlog2 (x); - - union { float f; uint32_t i; } vx = { x }; - float y = vx.i; - y *= 8.2629582881927490e-8f; - return y - 87.989971088f; -} - -#ifdef __SSE2__ - -static inline v4sf -vfastlog2 (v4sf x) -{ - union { v4sf f; v4si i; } vx = { x }; - union { v4si i; v4sf f; } mx; mx.i = (vx.i & v4sil (0x007FFFFF)) | v4sil (0x3f000000); - v4sf y = v4si_to_v4sf (vx.i); - y *= v4sfl (1.1920928955078125e-7f); - - const v4sf c_124_22551499 = v4sfl (124.22551499f); - const v4sf c_1_498030302 = v4sfl (1.498030302f); - const v4sf c_1_725877999 = v4sfl (1.72587999f); - const v4sf c_0_3520087068 = v4sfl (0.3520887068f); - - return y - c_124_22551499 - - c_1_498030302 * mx.f - - c_1_725877999 / (c_0_3520087068 + mx.f); -} - -static inline v4sf -vfastlog (v4sf x) -{ - const v4sf c_0_69314718 = v4sfl (0.69314718f); - - return c_0_69314718 * vfastlog2 (x); -} - -static inline v4sf -vfasterlog2 (v4sf x) -{ - union { v4sf f; v4si i; } vx = { x }; - v4sf y = v4si_to_v4sf (vx.i); - y *= v4sfl (1.1920928955078125e-7f); - - const v4sf c_126_94269504 = v4sfl (126.94269504f); - - return y - c_126_94269504; -} - -static inline v4sf -vfasterlog (v4sf x) -{ -// const v4sf c_0_69314718 = v4sfl (0.69314718f); -// -// return c_0_69314718 * vfasterlog2 (x); - - union { v4sf f; v4si i; } vx = { x }; - v4sf y = v4si_to_v4sf (vx.i); - y *= v4sfl (8.2629582881927490e-8f); - - const v4sf c_87_989971088 = v4sfl (87.989971088f); - - 
return y - c_87_989971088; -} - -#endif // __SSE2__ - -#endif // __FAST_LOG_H_ -/*=====================================================================* - * Copyright (C) 2011 Paul Mineiro * - * All rights reserved. * - * * - * Redistribution and use in source and binary forms, with * - * or without modification, are permitted provided that the * - * following conditions are met: * - * * - * * Redistributions of source code must retain the * - * above copyright notice, this list of conditions and * - * the following disclaimer. * - * * - * * Redistributions in binary form must reproduce the * - * above copyright notice, this list of conditions and * - * the following disclaimer in the documentation and/or * - * other materials provided with the distribution. * - * * - * * Neither the name of Paul Mineiro nor the names * - * of other contributors may be used to endorse or promote * - * products derived from this software without specific * - * prior written permission. * - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * - * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * - * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * - * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * - * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * - * POSSIBILITY OF SUCH DAMAGE. * - * * - * Contact: Paul Mineiro * - *=====================================================================*/ - -#ifndef __FAST_ERF_H_ -#define __FAST_ERF_H_ - -#include -#include - -// fasterfc: not actually faster than erfcf(3) on newer machines! -// ... although vectorized version is interesting -// and fastererfc is very fast - -static inline float -fasterfc (float x) -{ - static const float k = 3.3509633149424609f; - static const float a = 0.07219054755431126f; - static const float b = 15.418191568719577f; - static const float c = 5.609846028328545f; - - union { float f; uint32_t i; } vc = { c * x }; - float xsq = x * x; - float xquad = xsq * xsq; - - vc.i |= 0x80000000; - - return 2.0f / (1.0f + fastpow2 (k * x)) - a * x * (b * xquad - 1.0f) * fasterpow2 (vc.f); -} - -static inline float -fastererfc (float x) -{ - static const float k = 3.3509633149424609f; - - return 2.0f / (1.0f + fasterpow2 (k * x)); -} - -// fasterf: not actually faster than erff(3) on newer machines! -// ... 
although vectorized version is interesting -// and fastererf is very fast - -static inline float -fasterf (float x) -{ - return 1.0f - fasterfc (x); -} - -static inline float -fastererf (float x) +static inline float fastlog(float x) { - return 1.0f - fastererfc (x); + return 0.69314718f * fastlog2(x); } -static inline float -fastinverseerf (float x) +static inline float fasterlog2(float x) { - static const float invk = 0.30004578719350504f; - static const float a = 0.020287853348211326f; - static const float b = 0.07236892874789555f; - static const float c = 0.9913030456864257f; - static const float d = 0.8059775923760193f; - - float xsq = x * x; - - return invk * fastlog2 ((1.0f + x) / (1.0f - x)) - + x * (a - b * xsq) / (c - d * xsq); -} - -static inline float -fasterinverseerf (float x) -{ - static const float invk = 0.30004578719350504f; - - return invk * fasterlog2 ((1.0f + x) / (1.0f - x)); -} - -#ifdef __SSE2__ - -static inline v4sf -vfasterfc (v4sf x) -{ - const v4sf k = v4sfl (3.3509633149424609f); - const v4sf a = v4sfl (0.07219054755431126f); - const v4sf b = v4sfl (15.418191568719577f); - const v4sf c = v4sfl (5.609846028328545f); - - union { v4sf f; v4si i; } vc; vc.f = c * x; - vc.i |= v4sil (0x80000000); - - v4sf xsq = x * x; - v4sf xquad = xsq * xsq; - - return v4sfl (2.0f) / (v4sfl (1.0f) + vfastpow2 (k * x)) - a * x * (b * xquad - v4sfl (1.0f)) * vfasterpow2 (vc.f); + union + { + float f; + uint32_t i; + } vx = {x}; + float y = vx.i; + y *= 1.1920928955078125e-7f; + return y - 126.94269504f; } -static inline v4sf -vfastererfc (const v4sf x) +static inline float fasterlog(float x) { - const v4sf k = v4sfl (3.3509633149424609f); - - return v4sfl (2.0f) / (v4sfl (1.0f) + vfasterpow2 (k * x)); + union + { + float f; + uint32_t i; + } vx = {x}; + float y = vx.i; + y *= 8.2629582881927490e-8f; + return y - 87.989971088f; } -static inline v4sf -vfasterf (v4sf x) -{ - return v4sfl (1.0f) - vfasterfc (x); -} - -static inline v4sf -vfastererf (const v4sf x) -{ - return v4sfl (1.0f) - vfastererfc (x); -} - -static inline v4sf -vfastinverseerf (v4sf x) -{ - const v4sf invk = v4sfl (0.30004578719350504f); - const v4sf a = v4sfl (0.020287853348211326f); - const v4sf b = v4sfl (0.07236892874789555f); - const v4sf c = v4sfl (0.9913030456864257f); - const v4sf d = v4sfl (0.8059775923760193f); - - v4sf xsq = x * x; - - return invk * vfastlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x)) - + x * (a - b * xsq) / (c - d * xsq); -} - -static inline v4sf -vfasterinverseerf (v4sf x) -{ - const v4sf invk = v4sfl (0.30004578719350504f); - - return invk * vfasterlog2 ((v4sfl (1.0f) + x) / (v4sfl (1.0f) - x)); -} - -#endif //__SSE2__ - -#endif // __FAST_ERF_H_ -/*=====================================================================* - * Copyright (C) 2011 Paul Mineiro * - * All rights reserved. * - * * - * Redistribution and use in source and binary forms, with * - * or without modification, are permitted provided that the * - * following conditions are met: * - * * - * * Redistributions of source code must retain the * - * above copyright notice, this list of conditions and * - * the following disclaimer. * - * * - * * Redistributions in binary form must reproduce the * - * above copyright notice, this list of conditions and * - * the following disclaimer in the documentation and/or * - * other materials provided with the distribution. 
* - * * - * * Neither the name of Paul Mineiro nor the names * - * of other contributors may be used to endorse or promote * - * products derived from this software without specific * - * prior written permission. * - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * - * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * - * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * - * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * - * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * - * POSSIBILITY OF SUCH DAMAGE. * - * * - * Contact: Paul Mineiro * - *=====================================================================*/ - -#ifndef __FAST_GAMMA_H_ -#define __FAST_GAMMA_H_ - -#include - -/* gamma/digamma functions only work for positive inputs */ - -static inline float -fastlgamma (float x) -{ - float logterm = fastlog (x * (1.0f + x) * (2.0f + x)); - float xp3 = 3.0f + x; - - return - 2.081061466f - - x - + 0.0833333f / xp3 - - logterm - + (2.5f + x) * fastlog (xp3); -} - -static inline float -fasterlgamma (float x) -{ - return - 0.0810614667f - - x - - fasterlog (x) - + (0.5f + x) * fasterlog (1.0f + x); -} - -static inline float -fastdigamma (float x) -{ - float twopx = 2.0f + x; - float logterm = fastlog (twopx); - - return (-48.0f + x * (-157.0f + x * (-127.0f - 30.0f * x))) / - (12.0f * x * (1.0f + x) * twopx * twopx) - + logterm; -} - -static inline float -fasterdigamma (float x) -{ - float onepx = 1.0f + x; - - return -1.0f / x - 1.0f / (2 * onepx) + fasterlog (onepx); -} - -#ifdef __SSE2__ - -static inline v4sf -vfastlgamma (v4sf x) -{ - const v4sf c_1_0 = v4sfl (1.0f); - const v4sf c_2_0 = v4sfl (2.0f); - const v4sf c_3_0 = v4sfl (3.0f); - const v4sf c_2_081061466 = v4sfl (2.081061466f); - const v4sf c_0_0833333 = v4sfl (0.0833333f); - const v4sf c_2_5 = v4sfl (2.5f); - - v4sf logterm = vfastlog (x * (c_1_0 + x) * (c_2_0 + x)); - v4sf xp3 = c_3_0 + x; - - return - c_2_081061466 - - x - + c_0_0833333 / xp3 - - logterm - + (c_2_5 + x) * vfastlog (xp3); -} - -static inline v4sf -vfasterlgamma (v4sf x) -{ - const v4sf c_0_0810614667 = v4sfl (0.0810614667f); - const v4sf c_0_5 = v4sfl (0.5f); - const v4sf c_1 = v4sfl (1.0f); - - return - c_0_0810614667 - - x - - vfasterlog (x) - + (c_0_5 + x) * vfasterlog (c_1 + x); -} - -static inline v4sf -vfastdigamma (v4sf x) -{ - v4sf twopx = v4sfl (2.0f) + x; - v4sf logterm = vfastlog (twopx); - - return (v4sfl (-48.0f) + x * (v4sfl (-157.0f) + x * (v4sfl (-127.0f) - v4sfl (30.0f) * x))) / - (v4sfl (12.0f) * x * (v4sfl (1.0f) + x) * twopx * twopx) - + logterm; -} - -static inline v4sf -vfasterdigamma (v4sf x) -{ - const v4sf c_1_0 = v4sfl (1.0f); - const v4sf c_2_0 = v4sfl (2.0f); - v4sf onepx = c_1_0 + x; - - return -c_1_0 / x - c_1_0 / (c_2_0 * onepx) + vfasterlog (onepx); -} - -#endif //__SSE2__ - -#endif // __FAST_GAMMA_H_ -/*=====================================================================* - * Copyright (C) 2011 Paul Mineiro * - * All rights reserved. 
* - * * - * Redistribution and use in source and binary forms, with * - * or without modification, are permitted provided that the * - * following conditions are met: * - * * - * * Redistributions of source code must retain the * - * above copyright notice, this list of conditions and * - * the following disclaimer. * - * * - * * Redistributions in binary form must reproduce the * - * above copyright notice, this list of conditions and * - * the following disclaimer in the documentation and/or * - * other materials provided with the distribution. * - * * - * * Neither the name of Paul Mineiro nor the names * - * of other contributors may be used to endorse or promote * - * products derived from this software without specific * - * prior written permission. * - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * - * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * - * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * - * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * - * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * - * POSSIBILITY OF SUCH DAMAGE. * - * * - * Contact: Paul Mineiro * - *=====================================================================*/ - -#ifndef __FAST_HYPERBOLIC_H_ -#define __FAST_HYPERBOLIC_H_ - -#include - -static inline float -fastsinh (float p) -{ - return 0.5f * (fastexp (p) - fastexp (-p)); -} - -static inline float -fastersinh (float p) -{ - return 0.5f * (fasterexp (p) - fasterexp (-p)); -} - -static inline float -fastcosh (float p) -{ - return 0.5f * (fastexp (p) + fastexp (-p)); -} - -static inline float -fastercosh (float p) -{ - return 0.5f * (fasterexp (p) + fasterexp (-p)); -} - -static inline float -fasttanh (float p) -{ - return -1.0f + 2.0f / (1.0f + fastexp (-2.0f * p)); -} - -static inline float -fastertanh (float p) -{ - return -1.0f + 2.0f / (1.0f + fasterexp (-2.0f * p)); -} - -#ifdef __SSE2__ - -static inline v4sf -vfastsinh (const v4sf p) -{ - const v4sf c_0_5 = v4sfl (0.5f); - - return c_0_5 * (vfastexp (p) - vfastexp (-p)); -} - -static inline v4sf -vfastersinh (const v4sf p) -{ - const v4sf c_0_5 = v4sfl (0.5f); - - return c_0_5 * (vfasterexp (p) - vfasterexp (-p)); -} - -static inline v4sf -vfastcosh (const v4sf p) -{ - const v4sf c_0_5 = v4sfl (0.5f); - - return c_0_5 * (vfastexp (p) + vfastexp (-p)); -} - -static inline v4sf -vfastercosh (const v4sf p) -{ - const v4sf c_0_5 = v4sfl (0.5f); - - return c_0_5 * (vfasterexp (p) + vfasterexp (-p)); -} - -static inline v4sf -vfasttanh (const v4sf p) -{ - const v4sf c_1 = v4sfl (1.0f); - const v4sf c_2 = v4sfl (2.0f); - - return -c_1 + c_2 / (c_1 + vfastexp (-c_2 * p)); -} - -static inline v4sf -vfastertanh (const v4sf p) -{ - const v4sf c_1 = v4sfl (1.0f); - const v4sf c_2 = v4sfl (2.0f); - - return -c_1 + c_2 / (c_1 + vfasterexp (-c_2 * p)); -} - -#endif //__SSE2__ - -#endif // __FAST_HYPERBOLIC_H_ -/*=====================================================================* - * Copyright (C) 2011 Paul Mineiro * - * All rights reserved. 
* - * * - * Redistribution and use in source and binary forms, with * - * or without modification, are permitted provided that the * - * following conditions are met: * - * * - * * Redistributions of source code must retain the * - * above copyright notice, this list of conditions and * - * the following disclaimer. * - * * - * * Redistributions in binary form must reproduce the * - * above copyright notice, this list of conditions and * - * the following disclaimer in the documentation and/or * - * other materials provided with the distribution. * - * * - * * Neither the name of Paul Mineiro nor the names * - * of other contributors may be used to endorse or promote * - * products derived from this software without specific * - * prior written permission. * - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * - * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * - * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * - * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * - * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * - * POSSIBILITY OF SUCH DAMAGE. * - * * - * Contact: Paul Mineiro * - *=====================================================================*/ - -#ifndef __FAST_LAMBERT_W_H_ -#define __FAST_LAMBERT_W_H_ - -#include - -// these functions compute the upper branch aka W_0 - -static inline float -fastlambertw (float x) -{ - static const float threshold = 2.26445f; - - float c = (x < threshold) ? 1.546865557f : 1.0f; - float d = (x < threshold) ? 2.250366841f : 0.0f; - float a = (x < threshold) ? -0.737769969f : 0.0f; - - float logterm = fastlog (c * x + d); - float loglogterm = fastlog (logterm); - - float minusw = -a - logterm + loglogterm - loglogterm / logterm; - float expminusw = fastexp (minusw); - float xexpminusw = x * expminusw; - float pexpminusw = xexpminusw - minusw; - - return (2.0f * xexpminusw - minusw * (4.0f * xexpminusw - minusw * pexpminusw)) / - (2.0f + pexpminusw * (2.0f - minusw)); -} - -static inline float -fasterlambertw (float x) -{ - static const float threshold = 2.26445f; - - float c = (x < threshold) ? 1.546865557f : 1.0f; - float d = (x < threshold) ? 2.250366841f : 0.0f; - float a = (x < threshold) ? -0.737769969f : 0.0f; - - float logterm = fasterlog (c * x + d); - float loglogterm = fasterlog (logterm); - - float w = a + logterm - loglogterm + loglogterm / logterm; - float expw = fasterexp (-w); - - return (w * w + expw * x) / (1.0f + w); -} - -static inline float -fastlambertwexpx (float x) -{ - static const float k = 1.1765631309f; - static const float a = 0.94537622168f; - - float logarg = fmaxf (x, k); - float powarg = (x < k) ? 
a * (x - k) : 0; - - float logterm = fastlog (logarg); - float powterm = fasterpow2 (powarg); // don't need accuracy here - - float w = powterm * (logarg - logterm + logterm / logarg); - float logw = fastlog (w); - float p = x - logw; - - return w * (2.0f + p + w * (3.0f + 2.0f * p)) / - (2.0f - p + w * (5.0f + 2.0f * w)); -} - -static inline float -fasterlambertwexpx (float x) -{ - static const float k = 1.1765631309f; - static const float a = 0.94537622168f; - - float logarg = fmaxf (x, k); - float powarg = (x < k) ? a * (x - k) : 0; - - float logterm = fasterlog (logarg); - float powterm = fasterpow2 (powarg); - - float w = powterm * (logarg - logterm + logterm / logarg); - float logw = fasterlog (w); - - return w * (1.0f + x - logw) / (1.0f + w); -} - -#ifdef __SSE2__ - -static inline v4sf -vfastlambertw (v4sf x) -{ - const v4sf threshold = v4sfl (2.26445f); - - v4sf under = _mm_cmplt_ps (x, threshold); - v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)), - _mm_andnot_ps (under, v4sfl (1.0f))); - v4sf d = _mm_and_ps (under, v4sfl (2.250366841f)); - v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f)); - - v4sf logterm = vfastlog (c * x + d); - v4sf loglogterm = vfastlog (logterm); - - v4sf minusw = -a - logterm + loglogterm - loglogterm / logterm; - v4sf expminusw = vfastexp (minusw); - v4sf xexpminusw = x * expminusw; - v4sf pexpminusw = xexpminusw - minusw; - - return (v4sfl (2.0f) * xexpminusw - minusw * (v4sfl (4.0f) * xexpminusw - minusw * pexpminusw)) / - (v4sfl (2.0f) + pexpminusw * (v4sfl (2.0f) - minusw)); -} - -static inline v4sf -vfasterlambertw (v4sf x) -{ - const v4sf threshold = v4sfl (2.26445f); - - v4sf under = _mm_cmplt_ps (x, threshold); - v4sf c = _mm_or_ps (_mm_and_ps (under, v4sfl (1.546865557f)), - _mm_andnot_ps (under, v4sfl (1.0f))); - v4sf d = _mm_and_ps (under, v4sfl (2.250366841f)); - v4sf a = _mm_and_ps (under, v4sfl (-0.737769969f)); - - v4sf logterm = vfasterlog (c * x + d); - v4sf loglogterm = vfasterlog (logterm); - - v4sf w = a + logterm - loglogterm + loglogterm / logterm; - v4sf expw = vfasterexp (-w); - - return (w * w + expw * x) / (v4sfl (1.0f) + w); -} - -static inline v4sf -vfastlambertwexpx (v4sf x) -{ - const v4sf k = v4sfl (1.1765631309f); - const v4sf a = v4sfl (0.94537622168f); - const v4sf two = v4sfl (2.0f); - const v4sf three = v4sfl (3.0f); - const v4sf five = v4sfl (5.0f); - - v4sf logarg = _mm_max_ps (x, k); - v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k)); - - v4sf logterm = vfastlog (logarg); - v4sf powterm = vfasterpow2 (powarg); // don't need accuracy here - - v4sf w = powterm * (logarg - logterm + logterm / logarg); - v4sf logw = vfastlog (w); - v4sf p = x - logw; - - return w * (two + p + w * (three + two * p)) / - (two - p + w * (five + two * w)); -} - -static inline v4sf -vfasterlambertwexpx (v4sf x) -{ - const v4sf k = v4sfl (1.1765631309f); - const v4sf a = v4sfl (0.94537622168f); - - v4sf logarg = _mm_max_ps (x, k); - v4sf powarg = _mm_and_ps (_mm_cmplt_ps (x, k), a * (x - k)); - - v4sf logterm = vfasterlog (logarg); - v4sf powterm = vfasterpow2 (powarg); - - v4sf w = powterm * (logarg - logterm + logterm / logarg); - v4sf logw = vfasterlog (w); - - return w * (v4sfl (1.0f) + x - logw) / (v4sfl (1.0f) + w); -} - -#endif // __SSE2__ - -#endif // __FAST_LAMBERT_W_H_ - -/*=====================================================================* - * Copyright (C) 2011 Paul Mineiro * - * All rights reserved. 
* - * * - * Redistribution and use in source and binary forms, with * - * or without modification, are permitted provided that the * - * following conditions are met: * - * * - * * Redistributions of source code must retain the * - * above copyright notice, this list of conditions and * - * the following disclaimer. * - * * - * * Redistributions in binary form must reproduce the * - * above copyright notice, this list of conditions and * - * the following disclaimer in the documentation and/or * - * other materials provided with the distribution. * - * * - * * Neither the name of Paul Mineiro nor the names * - * of other contributors may be used to endorse or promote * - * products derived from this software without specific * - * prior written permission. * - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * - * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * - * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * - * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * - * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * - * POSSIBILITY OF SUCH DAMAGE. * - * * - * Contact: Paul Mineiro * - *=====================================================================*/ - -#ifndef __FAST_POW_H_ -#define __FAST_POW_H_ - -#include - -static inline float -fastpow (float x, - float p) -{ - return fastpow2 (p * fastlog2 (x)); -} - -static inline float -fasterpow (float x, - float p) -{ - return fasterpow2 (p * fasterlog2 (x)); -} - -#ifdef __SSE2__ - -static inline v4sf -vfastpow (const v4sf x, - const v4sf p) -{ - return vfastpow2 (p * vfastlog2 (x)); -} - -static inline v4sf -vfasterpow (const v4sf x, - const v4sf p) -{ - return vfasterpow2 (p * vfasterlog2 (x)); -} - -#endif //__SSE2__ - -#endif // __FAST_POW_H_ -/*=====================================================================* - * Copyright (C) 2011 Paul Mineiro * - * All rights reserved. * - * * - * Redistribution and use in source and binary forms, with * - * or without modification, are permitted provided that the * - * following conditions are met: * - * * - * * Redistributions of source code must retain the * - * above copyright notice, this list of conditions and * - * the following disclaimer. * - * * - * * Redistributions in binary form must reproduce the * - * above copyright notice, this list of conditions and * - * the following disclaimer in the documentation and/or * - * other materials provided with the distribution. * - * * - * * Neither the name of Paul Mineiro nor the names * - * of other contributors may be used to endorse or promote * - * products derived from this software without specific * - * prior written permission. * - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * - * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * - * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * - * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * - * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * - * POSSIBILITY OF SUCH DAMAGE. * - * * - * Contact: Paul Mineiro * - *=====================================================================*/ - -#ifndef __FAST_SIGMOID_H_ -#define __FAST_SIGMOID_H_ - -#include - -static inline float -fastsigmoid (float x) -{ - return 1.0f / (1.0f + fastexp (-x)); -} - -static inline float -fastersigmoid (float x) -{ - return 1.0f / (1.0f + fasterexp (-x)); -} - -#ifdef __SSE2__ - -static inline v4sf -vfastsigmoid (const v4sf x) -{ - const v4sf c_1 = v4sfl (1.0f); - - return c_1 / (c_1 + vfastexp (-x)); -} - -static inline v4sf -vfastersigmoid (const v4sf x) -{ - const v4sf c_1 = v4sfl (1.0f); - - return c_1 / (c_1 + vfasterexp (-x)); -} - -#endif //__SSE2__ - -#endif // __FAST_SIGMOID_H_ -/*=====================================================================* - * Copyright (C) 2011 Paul Mineiro * - * All rights reserved. * - * * - * Redistribution and use in source and binary forms, with * - * or without modification, are permitted provided that the * - * following conditions are met: * - * * - * * Redistributions of source code must retain the * - * above copyright notice, this list of conditions and * - * the following disclaimer. * - * * - * * Redistributions in binary form must reproduce the * - * above copyright notice, this list of conditions and * - * the following disclaimer in the documentation and/or * - * other materials provided with the distribution. * - * * - * * Neither the name of Paul Mineiro nor the names * - * of other contributors may be used to endorse or promote * - * products derived from this software without specific * - * prior written permission. * - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * - * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * - * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * - * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * - * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * - * POSSIBILITY OF SUCH DAMAGE. 
* - * * - * Contact: Paul Mineiro * - *=====================================================================*/ - -#ifndef __FAST_TRIG_H_ -#define __FAST_TRIG_H_ - -#include - -// http://www.devmaster.net/forums/showthread.php?t=5784 -// fast sine variants are for x \in [ -\pi, pi ] -// fast cosine variants are for x \in [ -\pi, pi ] -// fast tangent variants are for x \in [ -\pi / 2, pi / 2 ] -// "full" versions of functions handle the entire range of inputs -// although the range reduction technique used here will be hopelessly -// inaccurate for |x| >> 1000 -// -// WARNING: fastsinfull, fastcosfull, and fasttanfull can be slower than -// libc calls on older machines (!) and on newer machines are only -// slighly faster. however: -// * vectorized versions are competitive -// * faster full versions are competitive - -static inline float -fastsin (float x) -{ - static const float fouroverpi = 1.2732395447351627f; - static const float fouroverpisq = 0.40528473456935109f; - static const float q = 0.78444488374548933f; - union { float f; uint32_t i; } p = { 0.20363937680730309f }; - union { float f; uint32_t i; } r = { 0.015124940802184233f }; - union { float f; uint32_t i; } s = { -0.0032225901625579573f }; - - union { float f; uint32_t i; } vx = { x }; - uint32_t sign = vx.i & 0x80000000; - vx.i = vx.i & 0x7FFFFFFF; - - float qpprox = fouroverpi * x - fouroverpisq * x * vx.f; - float qpproxsq = qpprox * qpprox; - - p.i |= sign; - r.i |= sign; - s.i ^= sign; - - return q * qpprox + qpproxsq * (p.f + qpproxsq * (r.f + qpproxsq * s.f)); -} - -static inline float -fastersin (float x) -{ - static const float fouroverpi = 1.2732395447351627f; - static const float fouroverpisq = 0.40528473456935109f; - static const float q = 0.77633023248007499f; - union { float f; uint32_t i; } p = { 0.22308510060189463f }; - - union { float f; uint32_t i; } vx = { x }; - uint32_t sign = vx.i & 0x80000000; - vx.i &= 0x7FFFFFFF; - - float qpprox = fouroverpi * x - fouroverpisq * x * vx.f; - - p.i |= sign; - - return qpprox * (q + p.f * qpprox); -} - -static inline float -fastsinfull (float x) -{ - static const float twopi = 6.2831853071795865f; - static const float invtwopi = 0.15915494309189534f; - - int k = x * invtwopi; - float half = (x < 0) ? -0.5f : 0.5f; - return fastsin ((half + k) * twopi - x); -} - -static inline float -fastersinfull (float x) -{ - static const float twopi = 6.2831853071795865f; - static const float invtwopi = 0.15915494309189534f; - - int k = x * invtwopi; - float half = (x < 0) ? -0.5f : 0.5f; - return fastersin ((half + k) * twopi - x); -} - -static inline float -fastcos (float x) -{ - static const float halfpi = 1.5707963267948966f; - static const float halfpiminustwopi = -4.7123889803846899f; - float offset = (x > halfpi) ? 
halfpiminustwopi : halfpi; - return fastsin (x + offset); -} - -static inline float -fastercos (float x) -{ - static const float twooverpi = 0.63661977236758134f; - static const float p = 0.54641335845679634f; - - union { float f; uint32_t i; } vx = { x }; - vx.i &= 0x7FFFFFFF; - - float qpprox = 1.0f - twooverpi * vx.f; - - return qpprox + p * qpprox * (1.0f - qpprox * qpprox); -} - -static inline float -fastcosfull (float x) -{ - static const float halfpi = 1.5707963267948966f; - return fastsinfull (x + halfpi); -} - -static inline float -fastercosfull (float x) -{ - static const float halfpi = 1.5707963267948966f; - return fastersinfull (x + halfpi); -} - -static inline float -fasttan (float x) -{ - static const float halfpi = 1.5707963267948966f; - return fastsin (x) / fastsin (x + halfpi); -} - -static inline float -fastertan (float x) -{ - return fastersin (x) / fastercos (x); -} - -static inline float -fasttanfull (float x) -{ - static const float twopi = 6.2831853071795865f; - static const float invtwopi = 0.15915494309189534f; - - int k = x * invtwopi; - float half = (x < 0) ? -0.5f : 0.5f; - float xnew = x - (half + k) * twopi; - - return fastsin (xnew) / fastcos (xnew); -} - -static inline float -fastertanfull (float x) -{ - static const float twopi = 6.2831853071795865f; - static const float invtwopi = 0.15915494309189534f; - - int k = x * invtwopi; - float half = (x < 0) ? -0.5f : 0.5f; - float xnew = x - (half + k) * twopi; - - return fastersin (xnew) / fastercos (xnew); -} - -#ifdef __SSE2__ - -static inline v4sf -vfastsin (const v4sf x) -{ - const v4sf fouroverpi = v4sfl (1.2732395447351627f); - const v4sf fouroverpisq = v4sfl (0.40528473456935109f); - const v4sf q = v4sfl (0.78444488374548933f); - const v4sf p = v4sfl (0.20363937680730309f); - const v4sf r = v4sfl (0.015124940802184233f); - const v4sf s = v4sfl (-0.0032225901625579573f); - - union { v4sf f; v4si i; } vx = { x }; - v4si sign = vx.i & v4sil (0x80000000); - vx.i &= v4sil (0x7FFFFFFF); - - v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f; - v4sf qpproxsq = qpprox * qpprox; - union { v4sf f; v4si i; } vy; vy.f = qpproxsq * (p + qpproxsq * (r + qpproxsq * s)); - vy.i ^= sign; - - return q * qpprox + vy.f; -} - -static inline v4sf -vfastersin (const v4sf x) -{ - const v4sf fouroverpi = v4sfl (1.2732395447351627f); - const v4sf fouroverpisq = v4sfl (0.40528473456935109f); - const v4sf q = v4sfl (0.77633023248007499f); - const v4sf plit = v4sfl (0.22308510060189463f); - union { v4sf f; v4si i; } p = { plit }; - - union { v4sf f; v4si i; } vx = { x }; - v4si sign = vx.i & v4sil (0x80000000); - vx.i &= v4sil (0x7FFFFFFF); - - v4sf qpprox = fouroverpi * x - fouroverpisq * x * vx.f; - - p.i |= sign; - - return qpprox * (q + p.f * qpprox); -} - -static inline v4sf -vfastsinfull (const v4sf x) -{ - const v4sf twopi = v4sfl (6.2831853071795865f); - const v4sf invtwopi = v4sfl (0.15915494309189534f); - - v4si k = v4sf_to_v4si (x * invtwopi); - - v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); - v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), - _mm_andnot_ps (ltzero, v4sfl (0.5f))); - - return vfastsin ((half + v4si_to_v4sf (k)) * twopi - x); -} - -static inline v4sf -vfastersinfull (const v4sf x) -{ - const v4sf twopi = v4sfl (6.2831853071795865f); - const v4sf invtwopi = v4sfl (0.15915494309189534f); - - v4si k = v4sf_to_v4si (x * invtwopi); - - v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); - v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), - _mm_andnot_ps (ltzero, v4sfl (0.5f))); - - return 
vfastersin ((half + v4si_to_v4sf (k)) * twopi - x); -} - -static inline v4sf -vfastcos (const v4sf x) -{ - const v4sf halfpi = v4sfl (1.5707963267948966f); - const v4sf halfpiminustwopi = v4sfl (-4.7123889803846899f); - v4sf lthalfpi = _mm_cmpnlt_ps (x, halfpi); - v4sf offset = _mm_or_ps (_mm_and_ps (lthalfpi, halfpiminustwopi), - _mm_andnot_ps (lthalfpi, halfpi)); - return vfastsin (x + offset); -} - -static inline v4sf -vfastercos (v4sf x) -{ - const v4sf twooverpi = v4sfl (0.63661977236758134f); - const v4sf p = v4sfl (0.54641335845679634); - - v4sf vx = v4sf_fabs (x); - v4sf qpprox = v4sfl (1.0f) - twooverpi * vx; - - return qpprox + p * qpprox * (v4sfl (1.0f) - qpprox * qpprox); -} - -static inline v4sf -vfastcosfull (const v4sf x) -{ - const v4sf halfpi = v4sfl (1.5707963267948966f); - return vfastsinfull (x + halfpi); -} - -static inline v4sf -vfastercosfull (const v4sf x) -{ - const v4sf halfpi = v4sfl (1.5707963267948966f); - return vfastersinfull (x + halfpi); -} - -static inline v4sf -vfasttan (const v4sf x) -{ - const v4sf halfpi = v4sfl (1.5707963267948966f); - return vfastsin (x) / vfastsin (x + halfpi); -} - -static inline v4sf -vfastertan (const v4sf x) -{ - return vfastersin (x) / vfastercos (x); -} - -static inline v4sf -vfasttanfull (const v4sf x) -{ - const v4sf twopi = v4sfl (6.2831853071795865f); - const v4sf invtwopi = v4sfl (0.15915494309189534f); - - v4si k = v4sf_to_v4si (x * invtwopi); - - v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); - v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), - _mm_andnot_ps (ltzero, v4sfl (0.5f))); - v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi; - - return vfastsin (xnew) / vfastcos (xnew); -} - -static inline v4sf -vfastertanfull (const v4sf x) -{ - const v4sf twopi = v4sfl (6.2831853071795865f); - const v4sf invtwopi = v4sfl (0.15915494309189534f); - - v4si k = v4sf_to_v4si (x * invtwopi); - - v4sf ltzero = _mm_cmplt_ps (x, v4sfl (0.0f)); - v4sf half = _mm_or_ps (_mm_and_ps (ltzero, v4sfl (-0.5f)), - _mm_andnot_ps (ltzero, v4sfl (0.5f))); - v4sf xnew = x - (half + v4si_to_v4sf (k)) * twopi; - - return vfastersin (xnew) / vfastercos (xnew); -} - -#endif //__SSE2__ - -#endif // __FAST_TRIG_H_ +} // namespace fastapprox +#endif From e8de71498343f33823a039459ed19a6b6b70b934 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 21 Apr 2015 19:57:54 -0500 Subject: [PATCH 100/481] use fastapprox::fasterlog in all rankers --- src/index/ranker/lm_ranker.cpp | 8 ++++---- src/index/ranker/okapi_bm25.cpp | 3 ++- src/index/ranker/pivoted_length.cpp | 8 +++++--- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/index/ranker/lm_ranker.cpp b/src/index/ranker/lm_ranker.cpp index ce8c1cd8e..755a785b9 100644 --- a/src/index/ranker/lm_ranker.cpp +++ b/src/index/ranker/lm_ranker.cpp @@ -5,6 +5,7 @@ #include #include "corpus/document.h" +#include "util/fastapprox.h" #include "index/score_data.h" #include "index/ranker/lm_ranker.h" @@ -19,14 +20,13 @@ double language_model_ranker::score_one(const score_data& sd) { double ps = smoothed_prob(sd); double pc = static_cast(sd.corpus_term_count) / sd.total_terms; - - return sd.query_term_count * std::log(ps / (doc_constant(sd) * pc)); + return sd.query_term_count + * fastapprox::fasterlog(ps / (doc_constant(sd) * pc)); } double language_model_ranker::initial_score(const score_data& sd) const { - return sd.query.length() * std::log(doc_constant(sd)); + return sd.query.length() * fastapprox::fasterlog(doc_constant(sd)); } - } } diff --git a/src/index/ranker/okapi_bm25.cpp 
b/src/index/ranker/okapi_bm25.cpp index c31f731af..325d324e1 100644 --- a/src/index/ranker/okapi_bm25.cpp +++ b/src/index/ranker/okapi_bm25.cpp @@ -7,6 +7,7 @@ #include "index/inverted_index.h" #include "index/ranker/okapi_bm25.h" #include "index/score_data.h" +#include "util/fastapprox.h" namespace meta { @@ -25,7 +26,7 @@ double okapi_bm25::score_one(const score_data& sd) double doc_len = sd.idx.doc_size(sd.d_id); // add 1.0 to the IDF to ensure that the result is positive - double IDF = std::log( + double IDF = fastapprox::fasterlog( 1.0 + (sd.num_docs - sd.doc_count + 0.5) / (sd.doc_count + 0.5)); double TF = ((k1_ + 1.0) * sd.doc_term_count) diff --git a/src/index/ranker/pivoted_length.cpp b/src/index/ranker/pivoted_length.cpp index 2e03e9930..96cace9a8 100644 --- a/src/index/ranker/pivoted_length.cpp +++ b/src/index/ranker/pivoted_length.cpp @@ -6,6 +6,7 @@ #include "index/inverted_index.h" #include "index/ranker/pivoted_length.h" #include "index/score_data.h" +#include "util/fastapprox.h" namespace meta { @@ -22,10 +23,11 @@ pivoted_length::pivoted_length(double s) : s_{s} double pivoted_length::score_one(const score_data& sd) { double doc_len = sd.idx.doc_size(sd.d_id); - double TF = 1 + log(1 + log(sd.doc_term_count)); + double TF = 1 + fastapprox::fasterlog( + 1 + fastapprox::fasterlog(sd.doc_term_count)); double norm = (1 - s_) + s_ * (doc_len / sd.avg_dl); - double IDF = log((sd.num_docs + 1) / (0.5 + sd.doc_count)); - + double IDF + = fastapprox::fasterlog((sd.num_docs + 1) / (0.5 + sd.doc_count)); return TF / norm * sd.query_term_count * IDF; } From 526a7aa5d0b28487d2b02113b936fde2ce01a1d1 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 21 Apr 2015 23:16:14 -0500 Subject: [PATCH 101/481] Switch to not using caches for inverted index in examples. Since ranking now uses stream_for() instead of search_primary(), the cache doesn't really help anything anymore. --- src/classify/tools/classify.cpp | 3 +- src/index/tools/interactive-search.cpp | 7 +- src/index/tools/query-runner.cpp | 7 +- src/index/tools/search.cpp | 7 +- src/test/classifier_test.cpp | 3 +- src/test/inverted_index_test.cpp | 152 +++++++++++++------------ src/test/ir_eval_test.cpp | 5 +- src/test/ranker_test.cpp | 3 +- 8 files changed, 89 insertions(+), 98 deletions(-) diff --git a/src/classify/tools/classify.cpp b/src/classify/tools/classify.cpp index bc90fc375..794190d90 100644 --- a/src/classify/tools/classify.cpp +++ b/src/classify/tools/classify.cpp @@ -100,8 +100,7 @@ int main(int argc, char* argv[]) auto classifier_method = *class_config->get_as("method"); if (classifier_method == "knn" || classifier_method == "nearest-centroid") { - auto i_idx - = index::make_index(argv[1], 10000); + auto i_idx = index::make_index(argv[1]); classifier = classify::make_classifier(*class_config, f_idx, i_idx); } else diff --git a/src/index/tools/interactive-search.cpp b/src/index/tools/interactive-search.cpp index 764cfa0cb..17aa910fb 100644 --- a/src/index/tools/interactive-search.cpp +++ b/src/index/tools/interactive-search.cpp @@ -49,11 +49,8 @@ int main(int argc, char* argv[]) parser::register_analyzers(); sequence::register_analyzers(); - // Create an inverted index using a splay cache. The arguments forwarded - // to make_index are the config file for the index and any parameters - // for the cache. In this case, we set the maximum number of nodes in - // the splay_cache to be 10000. - auto idx = index::make_index(argv[1], 10000); + // Create an inverted index based on the config file. 
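+    // (No cache arguments are forwarded any more: ranking now streams
+    // postings via stream_for(), so the splay_cache size of 10000 that
+    // used to be passed here no longer buys anything.)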
+ auto idx = index::make_index(argv[1]); // Create a ranking class based on the config file. auto config = cpptoml::parse_file(argv[1]); diff --git a/src/index/tools/query-runner.cpp b/src/index/tools/query-runner.cpp index 8fa0f892c..3b65e79f4 100644 --- a/src/index/tools/query-runner.cpp +++ b/src/index/tools/query-runner.cpp @@ -35,11 +35,8 @@ int main(int argc, char* argv[]) parser::register_analyzers(); sequence::register_analyzers(); - // Create an inverted index using a DBLRU cache. The arguments forwarded to - // make_index are the config file for the index and any parameters for the - // cache. In this case, we set the maximum hash table size for the - // dblru_cache to be 10000. - auto idx = index::make_index(argv[1], 10000); + // Create an inverted index based on the config file + auto idx = index::make_index(argv[1]); // Create a ranking class based on the config file. auto config = cpptoml::parse_file(argv[1]); diff --git a/src/index/tools/search.cpp b/src/index/tools/search.cpp index 39da973da..70f9854fd 100644 --- a/src/index/tools/search.cpp +++ b/src/index/tools/search.cpp @@ -38,11 +38,8 @@ int main(int argc, char* argv[]) parser::register_analyzers(); sequence::register_analyzers(); - // Create an inverted index using a DBLRU cache. The arguments forwarded to - // make_index are the config file for the index and any parameters for the - // cache. In this case, we set the maximum hash table size for the - // dblru_cache to be 10000. - auto idx = index::make_index(argv[1], 10000); + // Create an inverted index based on the config file. + auto idx = index::make_index(argv[1]); auto config = cpptoml::parse_file(argv[1]); diff --git a/src/test/classifier_test.cpp b/src/test/classifier_test.cpp index df5cc50ee..f9457ce95 100644 --- a/src/test/classifier_test.cpp +++ b/src/test/classifier_test.cpp @@ -51,8 +51,7 @@ int run_tests(const std::string& type) // other filesystems that might lock opened files { auto i_idx - = index::make_index( - "test-config.toml"); + = index::make_index("test-config.toml"); auto f_idx = index::make_index( "test-config.toml"); diff --git a/src/test/inverted_index_test.cpp b/src/test/inverted_index_test.cpp index e615994fa..f8e1da375 100644 --- a/src/test/inverted_index_test.cpp +++ b/src/test/inverted_index_test.cpp @@ -106,98 +106,102 @@ int inverted_index_tests() create_config("file"); int num_failed = 0; - num_failed += testing::run_test("inverted-index-build-file-corpus", [&]() - { - system("rm -rf ceeaus-inv"); - auto idx - = index::make_index( - "test-config.toml", uint32_t{10000}); - check_ceeaus_expected(*idx); - }); - - num_failed += testing::run_test("inverted-index-read-file-corpus", [&]() - { + num_failed += testing::run_test( + "inverted-index-build-file-corpus", [&]() { - auto idx = index::make_index( - "test-config.toml", uint32_t{10000}); + system("rm -rf ceeaus-inv"); + auto idx + = index::make_index("test-config.toml"); check_ceeaus_expected(*idx); - check_term_id(*idx); - } - system("rm -rf ceeaus-inv test-config.toml"); - }); + }); + + num_failed += testing::run_test( + "inverted-index-read-file-corpus", [&]() + { + { + auto idx = index::make_index( + "test-config.toml"); + check_ceeaus_expected(*idx); + check_term_id(*idx); + } + system("rm -rf ceeaus-inv test-config.toml"); + }); create_config("line"); system("rm -rf ceeaus-inv"); - num_failed += testing::run_test("inverted-index-build-line-corpus", [&]() - { - auto idx - = index::make_index( - "test-config.toml", uint32_t{10000}); - check_ceeaus_expected(*idx); - }); + 
num_failed += testing::run_test( + "inverted-index-build-line-corpus", [&]() + { + auto idx + = index::make_index("test-config.toml"); + check_ceeaus_expected(*idx); + }); - num_failed += testing::run_test("inverted-index-read-line-corpus", [&]() - { - auto idx - = index::make_index( + num_failed += testing::run_test( + "inverted-index-read-line-corpus", [&]() + { + auto idx = index::make_index( "test-config.toml", uint32_t{10000}); - check_ceeaus_expected(*idx); - check_term_id(*idx); - check_term_id(*idx); // twice to check splay_caching - }); + check_ceeaus_expected(*idx); + check_term_id(*idx); + check_term_id(*idx); // twice to check splay_caching + }); #if META_HAS_ZLIB create_config("gz"); system("rm -rf ceeaus-inv"); - num_failed += testing::run_test("inverted-index-build-gz-corpus", [&]() - { - auto idx - = index::make_index( - "test-config.toml", 10000); - check_ceeaus_expected(*idx); - }); - - num_failed += testing::run_test("inverted-index-read-gz-corpus", [&]() - { - auto idx - = index::make_index( - "test-config.toml", 10000); - check_ceeaus_expected(*idx); - check_term_id(*idx); - }); + num_failed += testing::run_test( + "inverted-index-build-gz-corpus", [&]() + { + auto idx + = index::make_index("test-config.toml"); + check_ceeaus_expected(*idx); + }); + + num_failed += testing::run_test( + "inverted-index-read-gz-corpus", [&]() + { + auto idx + = index::make_index("test-config.toml"); + check_ceeaus_expected(*idx); + check_term_id(*idx); + }); #endif // test different caches - num_failed += testing::run_test("inverted-index-dblru-cache", [&]() - { - auto idx = index::make_index( - "test-config.toml", uint64_t{1000}); - check_term_id(*idx); - check_term_id(*idx); - }); - - num_failed += testing::run_test("inverted-index-no-evict-cache", [&]() - { - auto idx - = index::make_index( + num_failed += testing::run_test( + "inverted-index-dblru-cache", [&]() + { + auto idx = index::make_index( + "test-config.toml", uint64_t{1000}); + check_term_id(*idx); + check_term_id(*idx); + }); + + num_failed += testing::run_test( + "inverted-index-no-evict-cache", [&]() + { + auto idx = index::make_index( "test-config.toml"); - check_term_id(*idx); - check_term_id(*idx); - }); - - num_failed += testing::run_test("inverted-index-shard-cache", [&]() - { - auto idx = index::make_index( - "test-config.toml", uint8_t{8}); - check_term_id(*idx); - check_term_id(*idx); - }); + check_term_id(*idx); + check_term_id(*idx); + }); + + num_failed += testing::run_test( + "inverted-index-shard-cache", [&]() + { + auto idx = index::make_index( + "test-config.toml", uint8_t{8}); + check_term_id(*idx); + check_term_id(*idx); + }); system("rm -rf ceeaus-inv test-config.toml"); return num_failed; diff --git a/src/test/ir_eval_test.cpp b/src/test/ir_eval_test.cpp index c04b674f8..b4a885eed 100644 --- a/src/test/ir_eval_test.cpp +++ b/src/test/ir_eval_test.cpp @@ -36,9 +36,8 @@ int ir_eval_bounds() { system("rm -rf ceeaus-inv"); create_config("file"); - auto idx = index::make_index( - "test-config.toml", uint32_t{10000}); + auto idx + = index::make_index("test-config.toml"); index::okapi_bm25 ranker; index::ir_eval eval{"test-config.toml"}; // sanity test bounds diff --git a/src/test/ranker_test.cpp b/src/test/ranker_test.cpp index 0c6ff9791..df43a8055 100644 --- a/src/test/ranker_test.cpp +++ b/src/test/ranker_test.cpp @@ -39,8 +39,7 @@ int ranker_tests() { create_config("file"); system("rm -rf ceeaus-inv"); - auto idx = index::make_index( - "test-config.toml", uint32_t{10000}); + auto idx = 
index::make_index("test-config.toml"); auto config = cpptoml::parse_file("test-config.toml"); std::string encoding = "utf-8"; From 05d720eebb8cf64dc0eed96f5704b4828d6f7493 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 22 Apr 2015 00:58:30 -0500 Subject: [PATCH 102/481] Use packed binary format for intermediate chunks. This makes indexing a lot faster by speeding up the merge step considerably, and has the nice side-effect of making the intermediate chunks quite a bit smaller than they were before. --- include/index/chunk.tcc | 61 ++++++------ include/index/chunk_handler.tcc | 11 +-- include/index/postings_data.h | 123 ++++-------------------- include/index/postings_data.tcc | 137 +++++++++++++++------------ include/index/postings_file_writer.h | 28 +----- src/index/forward_index.cpp | 15 +-- src/index/inverted_index.cpp | 14 +-- 7 files changed, 144 insertions(+), 245 deletions(-) diff --git a/include/index/chunk.tcc b/include/index/chunk.tcc index 88f9a6e5e..91e218452 100644 --- a/include/index/chunk.tcc +++ b/include/index/chunk.tcc @@ -6,8 +6,6 @@ #include "index/chunk.h" #include "index/postings_data.h" -#include "io/compressed_file_reader.h" -#include "io/compressed_file_writer.h" #include "util/filesystem.h" namespace meta @@ -52,17 +50,14 @@ void chunk::merge_with(const chunk& other) { std::string temp_name = path_ + "_merge"; - io::compressed_file_reader my_data{path_, - io::default_compression_reader_func}; - io::compressed_file_reader other_data{other.path_, - io::default_compression_reader_func}; - io::compressed_file_writer output{temp_name, - io::default_compression_writer_func}; + std::ifstream my_data{path_, std::ios::binary}; + std::ifstream other_data{other.path_, std::ios::binary}; + std::ofstream output{temp_name, std::ios::binary}; postings_data my_pd; postings_data other_pd; - my_data >> my_pd; - other_data >> other_pd; + my_pd.read_packed(my_data); + other_pd.read_packed(other_data); uint64_t terms = 0; // merge while both have postings data @@ -74,25 +69,25 @@ void chunk::merge_with(const chunk& other) // merge my_pd.merge_with(other_pd); // write - output << my_pd; + my_pd.write_packed(output); // read next two postings data - my_data >> my_pd; - other_data >> other_pd; + my_pd.read_packed(my_data); + other_pd.read_packed(other_data); } else if (my_pd.primary_key() < other_pd.primary_key()) { // write the winner - output << my_pd; + my_pd.write_packed(output); // read next from the current chunk - my_data >> my_pd; + my_pd.read_packed(my_data); } else { // write the winner - output << other_pd; + other_pd.write_packed(output); // read next from the other chunk - other_data >> other_pd; + other_pd.read_packed(other_data); } } @@ -100,14 +95,14 @@ void chunk::merge_with(const chunk& other) while (my_data) { ++terms; - output << my_pd; - my_data >> my_pd; + my_pd.write_packed(output); + my_pd.read_packed(my_data); } while (other_data) { ++terms; - output << other_pd; - other_data >> other_pd; + other_pd.write_packed(output); + other_pd.read_packed(other_data); } my_data.close(); @@ -130,13 +125,11 @@ void chunk::memory_merge_with(Container& pdata) { std::string temp_name = path_ + "_merge"; - io::compressed_file_reader my_data{path_, - io::default_compression_reader_func}; - io::compressed_file_writer output{temp_name, - io::default_compression_writer_func}; + std::ifstream my_data{path_, std::ios::binary}; + std::ofstream output{temp_name, std::ios::binary}; postings_data my_pd; - my_data >> my_pd; + my_pd.read_packed(my_data); auto other_pd = pdata.begin(); 
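+    // Sorted merge: the on-disk chunk and the in-memory run are both
+    // ordered by primary key, so equal keys are combined with merge_with()
+    // and written once; otherwise the smaller key is copied through.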
uint64_t terms = 0; @@ -146,18 +139,18 @@ void chunk::memory_merge_with(Container& pdata) if (my_pd.primary_key() == other_pd->primary_key()) { my_pd.merge_with(*other_pd); - output << my_pd; - my_data >> my_pd; + my_pd.write_packed(output); + my_pd.read_packed(my_data); ++other_pd; } else if (my_pd.primary_key() < other_pd->primary_key()) { - output << my_pd; - my_data >> my_pd; + my_pd.write_packed(output); + my_pd.read_packed(my_data); } else { - output << *other_pd; + other_pd->write_packed(output); ++other_pd; } } @@ -166,13 +159,13 @@ void chunk::memory_merge_with(Container& pdata) while (my_data) { ++terms; - output << my_pd; - my_data >> my_pd; + my_pd.write_packed(output); + my_pd.read_packed(my_data); } while (other_pd != pdata.end()) { ++terms; - output << *other_pd; + other_pd->write_packed(output); ++other_pd; } diff --git a/include/index/chunk_handler.tcc b/include/index/chunk_handler.tcc index 50aa04acd..ec92f035e 100644 --- a/include/index/chunk_handler.tcc +++ b/include/index/chunk_handler.tcc @@ -105,12 +105,11 @@ void chunk_handler::write_chunk(std::vector& pdata) { std::string chunk_name = prefix_ + "/chunk-" + std::to_string(chunk_num); - io::compressed_file_writer outfile{chunk_name, - io::default_compression_writer_func}; - for (auto& p : pdata) - outfile << p; - - outfile.close(); // close so we can read the file size in chunk ctr + { + std::ofstream outfile{chunk_name, std::ios::binary}; + for (auto& p : pdata) + p.write_packed(outfile); + } std::ofstream termfile{chunk_name + ".numterms"}; termfile << pdata.size(); pdata.clear(); diff --git a/include/index/postings_data.h b/include/index/postings_data.h index 0e19a5678..861913f33 100644 --- a/include/index/postings_data.h +++ b/include/index/postings_data.h @@ -25,14 +25,6 @@ namespace meta namespace index { -template -class postings_data; - -template -io::compressed_file_reader& operator>>(io::compressed_file_reader&, - postings_data&); - /** * A class to represent the per-PrimaryKey data in an index's postings * file. For a given PrimaryKey, a mapping of SecondaryKey -> count information @@ -129,77 +121,32 @@ class postings_data bool operator<(const postings_data& other) const; /** - * Helper function used by istream operator. - * @param in The stream to read from - * @param pd The postings data object to write the stream info to - */ - friend void stream_helper(io::compressed_file_reader& in, - postings_data& pd) - { - pd.counts_.clear(); - uint32_t num_pairs = in.next(); - for (uint32_t i = 0; i < num_pairs; ++i) - { - SecondaryKey s_id = SecondaryKey{in.next()}; - uint64_t count = in.next(); - pd.counts_.emplace_back(s_id, static_cast(count)); - } - } - - /** - * Reads semi-compressed postings data from a compressed file. - * @param in The stream to read from - * @param pd The postings data object to write the stream info to - * @return the input stream - */ - friend io::compressed_file_reader& operator>> - <>(io::compressed_file_reader& in, - postings_data& pd); - - /** - * Writes semi-compressed postings data to a compressed file. + * Writes this postings data to an output stream in a packed binary + * format. 
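+     * (Concretely, as written by write_packed_counts() below: a packed
+     * count of pairs and a running total, then gap-encoded SecondaryKey
+     * deltas, each followed by its count.)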
* @param out The stream to write to - * @param pd The postings data object to write to the stream - * @return the output stream + * @return the number of bytes used to write out this postings data */ - friend io::compressed_file_writer& operator<<( - io::compressed_file_writer& out, - const postings_data& pd) - { - if (pd.counts_.empty()) - return out; - - out.write(pd.p_id_); - uint64_t size = pd.counts_.size(); - out.write(size); - for (auto& p : pd.counts_) - { - out.write(p.first); - out.write(static_cast(p.second)); - } - - return out; - } + template + uint64_t write_packed(std::ostream& out) const; /** - * Writes this postings_data to a compressed file. The mapping for the - * compressed file is already set, so we don't have to worry about it. - * We can also assume that we are already in the correct location of the - * file. - * @param writer The compressed file to write to + * Writes this postings data's counts to an output stream in a packed + * binary format. + * @param out The stream to write to + * @return the number of bytes used to write out this postings data's + * counts */ - template - void write_compressed(io::compressed_file_writer& writer) const; + template + uint64_t write_packed_counts(std::ostream& out) const; /** - * Reads compressed postings_data into this object. The mapping for the - * compressed file is already set, so we don't have to worry about it. - * We can also assume that we are already in the correct location of the - * file. - * @param reader The compressed file to read from + * Reads a postings data object from an input stream in a packed binary + * format. + * @param in The stream to read from + * @return the number of bytes read in consuming this postings data */ - template - void read_compressed(io::compressed_file_reader& reader); + template + uint64_t read_packed(std::istream& in); /** * @return the term_id for this postings_data @@ -227,42 +174,8 @@ class postings_data /// The (secondary_key_type, count) pairs util::sparse_vector counts_; - - /// delimiter used when writing to compressed files - const static uint64_t delimiter_ = std::numeric_limits::max(); }; -/** - * Reads semi-compressed postings data from a compressed file. - * @param in The stream to read from - * @param pd The postings data object to write the stream info to - * @return the input stream - */ -template -io::compressed_file_reader& operator>>(io::compressed_file_reader& in, - postings_data& pd) -{ - pd.p_id_ = in.next(); - stream_helper(in, pd); - return in; -} - -/** - * Reads semi-compressed postings data from a compressed file. 
- * @param in The stream to read from - * @param pd The postings data object to write the stream info to - * @return the input stream - */ -template <> -inline io::compressed_file_reader& operator>> - <>(io::compressed_file_reader& in, postings_data& pd) -{ - pd.p_id_ = in.next_string(); - stream_helper(in, pd); - return in; -} - /** * @param lhs The first postings_data * @param rhs The postings_data to compare with diff --git a/include/index/postings_data.tcc b/include/index/postings_data.tcc index a42e7f8f1..01807ee03 100644 --- a/include/index/postings_data.tcc +++ b/include/index/postings_data.tcc @@ -7,6 +7,8 @@ #include #include #include "index/postings_data.h" +#include "io/binary.h" +#include "io/packed.h" namespace meta { @@ -118,86 +120,52 @@ PrimaryKey postings_data::primary_key() const template template -void postings_data::write_compressed( - io::compressed_file_writer& writer) const -{ - writer.write(counts_.size()); - writer.write(std::accumulate(counts_.begin(), counts_.end(), uint64_t{0}, - [](uint64_t cur, const pair_t& pr) - { - return cur - + static_cast(pr.second); - })); - count_t mutable_counts{counts_.contents()}; - writer.write(mutable_counts[0].first); - if (std::is_same::value) - { - writer.write(static_cast(mutable_counts[0].second)); - } - else - { - writer.write(mutable_counts[0].second); - } +uint64_t postings_data::write_packed( + std::ostream& out) const +{ + uint64_t bytes = 0; - // use gap encoding on the SecondaryKeys (we know they are integral types) - uint64_t cur_id = mutable_counts[0].first; - for (size_t i = 1; i < mutable_counts.size(); ++i) - { - uint64_t temp_id = mutable_counts[i].first; - mutable_counts[i].first = mutable_counts[i].first - cur_id; - cur_id = temp_id; + if (std::is_same::value) + bytes += io::write_binary(out, p_id_); - writer.write(mutable_counts[i].first); - if (std::is_same::value) - { - writer.write(static_cast(mutable_counts[i].second)); - } - else - { - writer.write(mutable_counts[i].second); - } - } + bytes += write_packed_counts(out); + + return bytes; } template template -void postings_data::read_compressed( - io::compressed_file_reader& reader) +uint64_t postings_data::write_packed_counts(std::ostream& out) const { - uint64_t size = reader.next(); + auto bytes = io::packed::write(out, counts_.size()); - // ignore total counts sum - reader.next(); - - counts_.clear(); - counts_.reserve(size); + auto total_counts + = std::accumulate(counts_.begin(), counts_.end(), uint64_t{0}, + [](uint64_t cur, const pair_t& pr) + { + return cur + static_cast(pr.second); + }); + bytes += io::packed::write(out, total_counts); uint64_t last_id = 0; - - for (uint64_t i = 0; i < size; ++i) + for (const auto& count : counts_) { - uint64_t this_id = reader.next(); - // we're using gap encoding - last_id += this_id; - SecondaryKey key{last_id}; + bytes += io::packed::write(out, count.first - last_id); - double count; if (std::is_same::value) { - uint64_t next = reader.next(); - count = static_cast(next); + bytes + += io::packed::write(out, static_cast(count.second)); } else { - count = reader.next_double(); + bytes += io::packed::write(out, count.second); } - counts_.emplace_back(key, count); + last_id = count.first; } - // compress vector to conserve memory (it shouldn't be modified again after - // this) - counts_.shrink_to_fit(); + return bytes; } namespace @@ -219,6 +187,57 @@ uint64_t length(const T& elem, } } +template +template +uint64_t postings_data::read_packed(std::istream& in) +{ + if (in.get() == EOF) + return 0; + else + 
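+        // not at end-of-stream: put the peeked byte back before decoding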
in.unget(); + + uint64_t bytes = 0; + if (std::is_same::value) + { + io::read_binary(in, p_id_); + bytes += length(p_id_); + } + + uint64_t size; + uint64_t total_counts; + + bytes += io::packed::read(in, size); + bytes += io::packed::read(in, total_counts); + + counts_.clear(); + counts_.reserve(size); + + SecondaryKey id{0}; + for (uint64_t i = 0; i < size; ++i) + { + // gap encoding + uint64_t gap; + bytes += io::packed::read(in, gap); + id += gap; + + double count; + if (std::is_same::value) + { + uint64_t next; + bytes += io::packed::read(in, next); + count = static_cast(next); + } + else + { + bytes += io::packed::read(in, count); + } + + counts_.emplace_back(id, count); + } + + return bytes; +} + template uint64_t postings_data::bytes_used() const { diff --git a/include/index/postings_file_writer.h b/include/index/postings_file_writer.h index 066560ba5..016a592ee 100644 --- a/include/index/postings_file_writer.h +++ b/include/index/postings_file_writer.h @@ -44,33 +44,7 @@ class postings_file_writer void write(const PostingsData& pdata) { byte_locations_[id_] = byte_pos_; - byte_pos_ += io::packed::write(output_, pdata.counts().size()); - - auto total_counts = std::accumulate( - pdata.counts().begin(), pdata.counts().end(), uint64_t{0}, - [](uint64_t cur, const typename PostingsData::pair_t& pr) - { - return cur + static_cast(pr.second); - }); - byte_pos_ += io::packed::write(output_, total_counts); - - uint64_t last_id = 0; - for (const auto& count : pdata.counts()) - { - byte_pos_ += io::packed::write(output_, count.first - last_id); - - if (std::is_same::value) - { - byte_pos_ += io::packed::write( - output_, static_cast(count.second)); - } - else - { - byte_pos_ += io::packed::write(output_, count.second); - } - - last_id = count.first; - } + byte_pos_ += pdata.template write_packed_counts(output_); ++id_; } diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index d27636137..bf4df97cb 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -340,18 +340,19 @@ void forward_index::impl::compress(const std::string& filename, postings_file_writer out{filename, num_docs}; forward_index::postings_data_type pdata; - auto length = filesystem::file_size(ucfilename) * 8; // number of bits - io::compressed_file_reader in{ucfilename, - io::default_compression_reader_func}; + auto length = filesystem::file_size(ucfilename); + + std::ifstream in{ucfilename, std::ios::binary}; + uint64_t byte_pos = 0; printing::progress progress{ - " > Compressing postings: ", length, 500, 8 * 1024 /* 1KB */ + " > Compressing postings: ", length, 500, 1024 /* 1KB */ }; // note: we will be accessing pdata in sorted order - while (in.has_next()) + while (auto bytes = pdata.read_packed(in)) { - in >> pdata; - progress(in.bit_location()); + byte_pos += bytes; + progress(byte_pos); out.write(pdata); } } diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp index d43ea283e..9c021b66e 100644 --- a/src/index/inverted_index.cpp +++ b/src/index/inverted_index.cpp @@ -230,18 +230,18 @@ void inverted_index::impl::compress(const std::string& filename, + idx_->impl_->files[TERM_IDS_MAPPING]}; postings_data pdata; - auto length = filesystem::file_size(ucfilename) * 8; // number of bits - io::compressed_file_reader in{ucfilename, - io::default_compression_reader_func}; + auto length = filesystem::file_size(ucfilename); + std::ifstream in{ucfilename, std::ios::binary}; + uint64_t byte_pos = 0; printing::progress progress{ - " > Compressing postings: ", length, 500, 8 
* 1024 /* 1KB */ + " > Compressing postings: ", length, 500, 1024 /* 1KB */ }; // note: we will be accessing pdata in sorted order - while (in.has_next()) + while (auto bytes = pdata.read_packed(in)) { - in >> pdata; - progress(in.bit_location()); + byte_pos += bytes; + progress(byte_pos); vocab.insert(pdata.primary_key()); out.write(pdata); } From d920906af25652ba3d4761eb94eae03d3935ce3c Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 22 Apr 2015 01:03:14 -0500 Subject: [PATCH 103/481] Make travis build in both debug and release mode. The unit tests are only run for the release configuration. --- .travis.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index dcfc465d6..ee9872f08 100644 --- a/.travis.yml +++ b/.travis.yml @@ -55,6 +55,5 @@ before_script: - cp ../config.toml ./ script: - - cmake ../ -DCMAKE_BUILD_TYPE=Debug - - make - - ctest --output-on-failure + - cmake ../ -DCMAKE_BUILD_TYPE=Debug && make && make clean + - rm -rf CMake* && cmake ../ -DCMAKE_BUILD_TYPE=Release && make && ctest --output-on-failure From 0b2082bbabe544d8defd35f417dcea26397892eb Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Wed, 22 Apr 2015 11:20:24 -0500 Subject: [PATCH 104/481] use floats instead of doubles for scoring --- include/classify/classifier/knn.h | 2 +- include/index/eval/ir_eval.h | 2 +- include/index/ranker/absolute_discount.h | 8 ++++---- include/index/ranker/dirichlet_prior.h | 10 +++++----- include/index/ranker/jelinek_mercer.h | 10 +++++----- include/index/ranker/lm_ranker.h | 8 ++++---- include/index/ranker/okapi_bm25.h | 18 +++++++++--------- include/index/ranker/pivoted_length.h | 8 ++++---- include/index/ranker/ranker.h | 6 +++--- src/classify/classifier/knn.cpp | 2 +- src/index/eval/ir_eval.cpp | 14 +++++++------- src/index/ranker/absolute_discount.cpp | 14 +++++++------- src/index/ranker/dirichlet_prior.cpp | 12 ++++++------ src/index/ranker/jelinek_mercer.cpp | 10 +++++----- src/index/ranker/lm_ranker.cpp | 8 ++++---- src/index/ranker/okapi_bm25.cpp | 12 ++++++------ src/index/ranker/pivoted_length.cpp | 12 ++++++------ src/index/ranker/ranker.cpp | 13 ++++++------- src/index/tools/interactive-search.cpp | 2 +- src/test/ir_eval_test.cpp | 4 ++-- 20 files changed, 87 insertions(+), 88 deletions(-) diff --git a/include/classify/classifier/knn.h b/include/classify/classifier/knn.h index 0b8288b01..24012e0a8 100644 --- a/include/classify/classifier/knn.h +++ b/include/classify/classifier/knn.h @@ -69,7 +69,7 @@ class knn : public classifier * @return the best label */ class_label select_best_label( - const std::vector>& scored, + const std::vector>& scored, const std::vector>& sorted) const; /** the inverted index used for ranking */ diff --git a/include/index/eval/ir_eval.h b/include/index/eval/ir_eval.h index 88041cc78..85da1fc3a 100644 --- a/include/index/eval/ir_eval.h +++ b/include/index/eval/ir_eval.h @@ -30,7 +30,7 @@ namespace index class ir_eval { public: - using result_type = std::vector>; + using result_type = std::vector>; /** * @param config_file Path to cpptoml configuration file diff --git a/include/index/ranker/absolute_discount.h b/include/index/ranker/absolute_discount.h index 3c3104db6..73781e013 100644 --- a/include/index/ranker/absolute_discount.h +++ b/include/index/ranker/absolute_discount.h @@ -32,23 +32,23 @@ class absolute_discount : public language_model_ranker /** * @param delta */ - absolute_discount(double delta = 0.7); + absolute_discount(float delta = 0.7); /** * Calculates the smoothed 
probability of a term. * @param sd score_data for the current query */ - double smoothed_prob(const score_data& sd) const override; + float smoothed_prob(const score_data& sd) const override; /** * A document-dependent constant. * @param sd score_data for the current query */ - double doc_constant(const score_data& sd) const override; + float doc_constant(const score_data& sd) const override; private: /// the absolute discounting parameter - const double delta_; + const float delta_; }; /** diff --git a/include/index/ranker/dirichlet_prior.h b/include/index/ranker/dirichlet_prior.h index 39ff10773..e5b496b6f 100644 --- a/include/index/ranker/dirichlet_prior.h +++ b/include/index/ranker/dirichlet_prior.h @@ -27,28 +27,28 @@ class dirichlet_prior : public language_model_ranker const static std::string id; /// Default value of mu - const static constexpr double default_mu = 2000; + const static constexpr float default_mu = 2000; /** * @param mu */ - dirichlet_prior(double mu = default_mu); + dirichlet_prior(float mu = default_mu); /** * Calculates the smoothed probability of a term. * @param sd score_data for the current query */ - double smoothed_prob(const score_data& sd) const override; + float smoothed_prob(const score_data& sd) const override; /** * A document-dependent constant. * @param sd score_data for the current query */ - double doc_constant(const score_data& sd) const override; + float doc_constant(const score_data& sd) const override; private: /// the Dirichlet prior parameter - const double mu_; + const float mu_; }; /** diff --git a/include/index/ranker/jelinek_mercer.h b/include/index/ranker/jelinek_mercer.h index 36c07de9d..f1dcf32c3 100644 --- a/include/index/ranker/jelinek_mercer.h +++ b/include/index/ranker/jelinek_mercer.h @@ -30,28 +30,28 @@ class jelinek_mercer : public language_model_ranker const static std::string id; /// Default value of lambda - const static constexpr double default_lambda = 0.7; + const static constexpr float default_lambda = 0.7; /** * @param lambda */ - jelinek_mercer(double lambda = default_lambda); + jelinek_mercer(float lambda = default_lambda); /** * Calculates the smoothed probability of a term. * @param sd */ - double smoothed_prob(const score_data& sd) const override; + float smoothed_prob(const score_data& sd) const override; /** * A document-dependent constant. * @param sd */ - double doc_constant(const score_data& sd) const override; + float doc_constant(const score_data& sd) const override; private: /// the JM parameter - const double lambda_; + const float lambda_; }; /** diff --git a/include/index/ranker/lm_ranker.h b/include/index/ranker/lm_ranker.h index 111b3d65f..71ff86170 100644 --- a/include/index/ranker/lm_ranker.h +++ b/include/index/ranker/lm_ranker.h @@ -30,21 +30,21 @@ class language_model_ranker : public ranker /** * @param sd */ - double score_one(const score_data& sd) override; + float score_one(const score_data& sd) override; - double initial_score(const score_data& sd) const override; + float initial_score(const score_data& sd) const override; /** * Calculates the smoothed probability of a term. * @param sd */ - virtual double smoothed_prob(const score_data& sd) const = 0; + virtual float smoothed_prob(const score_data& sd) const = 0; /** * A document-dependent constant. * @param sd */ - virtual double doc_constant(const score_data& sd) const = 0; + virtual float doc_constant(const score_data& sd) const = 0; /** * Default destructor. 
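(Editorial sketch, not part of the patch: the language-model rankers above all score a matched term as c(w,q) * log(ps / (alpha_d * pc)), which fits comfortably in single precision. A minimal standalone version of the Dirichlet-prior case follows, assuming only the score_data fields shown in these diffs, a reachable fastapprox::fasterlog as in patch 100, and an invented helper name dirichlet_term_score:)

#include "util/fastapprox.h"

// Sketch: one query term's contribution under Dirichlet-prior smoothing,
// mirroring smoothed_prob() / doc_constant() / score_one() above.
inline float dirichlet_term_score(float query_term_count,
                                  float doc_term_count, float doc_size,
                                  float corpus_term_count, float total_terms,
                                  float mu = 2000.0f)
{
    float pc = corpus_term_count / total_terms;              // p(w|C)
    float ps = (doc_term_count + mu * pc) / (doc_size + mu); // smoothed p(w|d)
    float dc = mu / (doc_size + mu);                         // doc constant
    return query_term_count * fastapprox::fasterlog(ps / (dc * pc));
}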
diff --git a/include/index/ranker/okapi_bm25.h b/include/index/ranker/okapi_bm25.h index 85111b865..ebbf20502 100644 --- a/include/index/ranker/okapi_bm25.h +++ b/include/index/ranker/okapi_bm25.h @@ -27,34 +27,34 @@ class okapi_bm25 : public ranker const static std::string id; /// Default k1, doc term smoothing - const static constexpr double default_k1 = 1.2; + const static constexpr float default_k1 = 1.2; /// Default b, length normalization - const static constexpr double default_b = 0.75; + const static constexpr float default_b = 0.75; /// Default k3, query term smoothing - const static constexpr double default_k3 = 500.0; + const static constexpr float default_k3 = 500.0; /** * @param k1 Doc term smoothing * @param b Length normalization * @param k3 Query term smoothing */ - okapi_bm25(double k1 = default_k1, double b = default_b, - double k3 = default_k3); + okapi_bm25(float k1 = default_k1, float b = default_b, + float k3 = default_k3); /** * @param sd score_data for the current query */ - double score_one(const score_data& sd) override; + float score_one(const score_data& sd) override; private: /// Doc term smoothing - const double k1_; + const float k1_; /// Length normalization - const double b_; + const float b_; /// Query term smoothing - const double k3_; + const float k3_; }; /** diff --git a/include/index/ranker/pivoted_length.h b/include/index/ranker/pivoted_length.h index f2205bc75..5c6a194a5 100644 --- a/include/index/ranker/pivoted_length.h +++ b/include/index/ranker/pivoted_length.h @@ -29,21 +29,21 @@ class pivoted_length : public ranker const static std::string id; /// Default value of s parameter - const static constexpr double default_s = 0.20; + const static constexpr float default_s = 0.20; /** * @param s */ - pivoted_length(double s = default_s); + pivoted_length(float s = default_s); /** * @param sd the score_data for this query */ - double score_one(const score_data& sd) override; + float score_one(const score_data& sd) override; private: /// s parameter for pivoted_length normalization - const double s_; + const float s_; }; /** diff --git a/include/index/ranker/ranker.h b/include/index/ranker/ranker.h index bde75b6a5..faaabf235 100644 --- a/include/index/ranker/ranker.h +++ b/include/index/ranker/ranker.h @@ -48,7 +48,7 @@ class ranker * @param filter A filtering function to apply to each doc_id; returns true * if the document should be included in results */ - std::vector> + std::vector> score(inverted_index& idx, corpus::document& query, uint64_t num_results = 10, const std::function& filter = [](doc_id) @@ -61,14 +61,14 @@ class ranker * query term. * @param sd The score_data for this query */ - virtual double score_one(const score_data& sd) = 0; + virtual float score_one(const score_data& sd) = 0; /** * Computes the constant contribution to the score of a particular * document. * @param sd The score_data for the query */ - virtual double initial_score(const score_data& sd) const; + virtual float initial_score(const score_data& sd) const; /** * Default destructor. 
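(Editorial sketch, not part of the patch: calling the float-based score() with its optional filter. The config path, query text, and the filtered doc_id are placeholders, and the snippet assumes the includes already used by the bundled search tools; results here are still std::pair<doc_id, float>, since search_result only arrives in patch 106:)

auto idx = index::make_index<index::inverted_index>("config.toml");
index::okapi_bm25 ranker; // float-scored after this patch
corpus::document query;
query.content("smoothed language models");
// the optional filter rejects any document we never want returned
auto top = ranker.score(*idx, query, 5,
                        [](doc_id d) { return d != doc_id{42}; });
for (const auto& p : top)
    std::cout << p.first << " " << p.second << "\n";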
diff --git a/src/classify/classifier/knn.cpp b/src/classify/classifier/knn.cpp index db926e019..7d8b40748 100644 --- a/src/classify/classifier/knn.cpp +++ b/src/classify/classifier/knn.cpp @@ -78,7 +78,7 @@ class_label knn::classify(doc_id d_id) } class_label knn::select_best_label( - const std::vector>& scored, + const std::vector>& scored, const std::vector>& sorted) const { uint16_t highest = sorted.begin()->second; diff --git a/src/index/eval/ir_eval.cpp b/src/index/eval/ir_eval.cpp index 1755334c6..c3a5856b1 100644 --- a/src/index/eval/ir_eval.cpp +++ b/src/index/eval/ir_eval.cpp @@ -61,7 +61,7 @@ void ir_eval::init_index(const std::string& path) } } -double ir_eval::precision(const std::vector>& results, +double ir_eval::precision(const std::vector>& results, query_id q_id, uint64_t num_docs) const { if (results.empty()) @@ -75,7 +75,7 @@ double ir_eval::precision(const std::vector>& results, return relevant_retrieved(results, q_id, num_docs) / denominator; } -double ir_eval::recall(const std::vector>& results, +double ir_eval::recall(const std::vector>& results, query_id q_id, uint64_t num_docs) const { if (results.empty()) @@ -89,7 +89,7 @@ double ir_eval::recall(const std::vector>& results, } double ir_eval::relevant_retrieved(const std::vector - >& results, + >& results, query_id q_id, uint64_t num_docs) const { double rel = 0.0; @@ -106,7 +106,7 @@ double ir_eval::relevant_retrieved(const std::vector return rel; } -double ir_eval::f1(const std::vector>& results, +double ir_eval::f1(const std::vector>& results, query_id q_id, uint64_t num_docs, double beta) const { double p = precision(results, q_id, num_docs); @@ -120,7 +120,7 @@ double ir_eval::f1(const std::vector>& results, return numerator / denominator; } -double ir_eval::ndcg(const std::vector>& results, +double ir_eval::ndcg(const std::vector>& results, query_id q_id, uint64_t num_docs) const { // find this query's judgements @@ -157,7 +157,7 @@ double ir_eval::ndcg(const std::vector>& results, return dcg / idcg; } -double ir_eval::avg_p(const std::vector>& results, +double ir_eval::avg_p(const std::vector>& results, query_id q_id, uint64_t num_docs) { const auto& ht = qrels_.find(q_id); @@ -211,7 +211,7 @@ double ir_eval::gmap() const return std::exp(sum / scores_.size()); } -void ir_eval::print_stats(const std::vector>& results, +void ir_eval::print_stats(const std::vector>& results, query_id q_id, std::ostream& out) { auto w1 = std::setw(8); diff --git a/src/index/ranker/absolute_discount.cpp b/src/index/ranker/absolute_discount.cpp index cc88ba201..2c4706dd8 100644 --- a/src/index/ranker/absolute_discount.cpp +++ b/src/index/ranker/absolute_discount.cpp @@ -15,22 +15,22 @@ namespace index const std::string absolute_discount::id = "absolute-discount"; -absolute_discount::absolute_discount(double delta) : delta_{delta} +absolute_discount::absolute_discount(float delta) : delta_{delta} { /* nothing */ } -double absolute_discount::smoothed_prob(const score_data& sd) const +float absolute_discount::smoothed_prob(const score_data& sd) const { - double pc = static_cast(sd.corpus_term_count) / sd.total_terms; - double numerator = std::max(sd.doc_term_count - delta_, 0); - double denominator = sd.doc_size; + float pc = static_cast(sd.corpus_term_count) / sd.total_terms; + float numerator = std::max(sd.doc_term_count - delta_, 0); + float denominator = sd.doc_size; return numerator / denominator + doc_constant(sd) * pc; } -double absolute_discount::doc_constant(const score_data& sd) const +float 
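+// the probability mass delta * (unique terms in d) / |d| that absolute
+// discounting reserves for the collection model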
absolute_discount::doc_constant(const score_data& sd) const { - double unique = sd.doc_unique_terms; + float unique = sd.doc_unique_terms; return delta_ * unique / sd.doc_size; } diff --git a/src/index/ranker/dirichlet_prior.cpp b/src/index/ranker/dirichlet_prior.cpp index b9e028897..360808174 100644 --- a/src/index/ranker/dirichlet_prior.cpp +++ b/src/index/ranker/dirichlet_prior.cpp @@ -14,20 +14,20 @@ namespace index const std::string dirichlet_prior::id = "dirichlet-prior"; -dirichlet_prior::dirichlet_prior(double mu) : mu_{mu} +dirichlet_prior::dirichlet_prior(float mu) : mu_{mu} { /* nothing */ } -double dirichlet_prior::smoothed_prob(const score_data& sd) const +float dirichlet_prior::smoothed_prob(const score_data& sd) const { - double pc = static_cast(sd.corpus_term_count) / sd.total_terms; - double numerator = sd.doc_term_count + mu_ * pc; - double denominator = sd.doc_size + mu_; + float pc = static_cast(sd.corpus_term_count) / sd.total_terms; + float numerator = sd.doc_term_count + mu_ * pc; + float denominator = sd.doc_size + mu_; return numerator / denominator; } -double dirichlet_prior::doc_constant(const score_data& sd) const +float dirichlet_prior::doc_constant(const score_data& sd) const { return mu_ / (sd.doc_size + mu_); } diff --git a/src/index/ranker/jelinek_mercer.cpp b/src/index/ranker/jelinek_mercer.cpp index 9d0b98fb4..a5e1c534a 100644 --- a/src/index/ranker/jelinek_mercer.cpp +++ b/src/index/ranker/jelinek_mercer.cpp @@ -14,20 +14,20 @@ namespace index const std::string jelinek_mercer::id = "jelinek-mercer"; -jelinek_mercer::jelinek_mercer(double lambda) : lambda_{lambda} +jelinek_mercer::jelinek_mercer(float lambda) : lambda_{lambda} { /* nothing */ } -double jelinek_mercer::smoothed_prob(const score_data& sd) const +float jelinek_mercer::smoothed_prob(const score_data& sd) const { - double max_likelihood = static_cast(sd.doc_term_count) + float max_likelihood = static_cast(sd.doc_term_count) / sd.doc_size; - double pc = static_cast(sd.corpus_term_count) / sd.total_terms; + float pc = static_cast(sd.corpus_term_count) / sd.total_terms; return (1.0 - lambda_) * max_likelihood + lambda_ * pc; } -double jelinek_mercer::doc_constant(const score_data&) const +float jelinek_mercer::doc_constant(const score_data&) const { return lambda_; } diff --git a/src/index/ranker/lm_ranker.cpp b/src/index/ranker/lm_ranker.cpp index 755a785b9..8f09f5ac2 100644 --- a/src/index/ranker/lm_ranker.cpp +++ b/src/index/ranker/lm_ranker.cpp @@ -16,15 +16,15 @@ namespace index const std::string language_model_ranker::id = "language-model"; -double language_model_ranker::score_one(const score_data& sd) +float language_model_ranker::score_one(const score_data& sd) { - double ps = smoothed_prob(sd); - double pc = static_cast(sd.corpus_term_count) / sd.total_terms; + float ps = smoothed_prob(sd); + float pc = static_cast(sd.corpus_term_count) / sd.total_terms; return sd.query_term_count * fastapprox::fasterlog(ps / (doc_constant(sd) * pc)); } -double language_model_ranker::initial_score(const score_data& sd) const +float language_model_ranker::initial_score(const score_data& sd) const { return sd.query.length() * fastapprox::fasterlog(doc_constant(sd)); } diff --git a/src/index/ranker/okapi_bm25.cpp b/src/index/ranker/okapi_bm25.cpp index 325d324e1..331a6ce91 100644 --- a/src/index/ranker/okapi_bm25.cpp +++ b/src/index/ranker/okapi_bm25.cpp @@ -16,24 +16,24 @@ namespace index const std::string okapi_bm25::id = "bm25"; -okapi_bm25::okapi_bm25(double k1, double b, double k3) : k1_{k1}, b_{b}, 
k3_{k3} +okapi_bm25::okapi_bm25(float k1, float b, float k3) : k1_{k1}, b_{b}, k3_{k3} { /* nothing */ } -double okapi_bm25::score_one(const score_data& sd) +float okapi_bm25::score_one(const score_data& sd) { - double doc_len = sd.idx.doc_size(sd.d_id); + float doc_len = sd.idx.doc_size(sd.d_id); // add 1.0 to the IDF to ensure that the result is positive - double IDF = fastapprox::fasterlog( + float IDF = fastapprox::fasterlog( 1.0 + (sd.num_docs - sd.doc_count + 0.5) / (sd.doc_count + 0.5)); - double TF = ((k1_ + 1.0) * sd.doc_term_count) + float TF = ((k1_ + 1.0) * sd.doc_term_count) / ((k1_ * ((1.0 - b_) + b_ * doc_len / sd.avg_dl)) + sd.doc_term_count); - double QTF = ((k3_ + 1.0) * sd.query_term_count) + float QTF = ((k3_ + 1.0) * sd.query_term_count) / (k3_ + sd.query_term_count); return TF * IDF * QTF; diff --git a/src/index/ranker/pivoted_length.cpp b/src/index/ranker/pivoted_length.cpp index 96cace9a8..8c9054492 100644 --- a/src/index/ranker/pivoted_length.cpp +++ b/src/index/ranker/pivoted_length.cpp @@ -15,18 +15,18 @@ namespace index const std::string pivoted_length::id = "pivoted-length"; -pivoted_length::pivoted_length(double s) : s_{s} +pivoted_length::pivoted_length(float s) : s_{s} { /* nothing */ } -double pivoted_length::score_one(const score_data& sd) +float pivoted_length::score_one(const score_data& sd) { - double doc_len = sd.idx.doc_size(sd.d_id); - double TF = 1 + fastapprox::fasterlog( + float doc_len = sd.idx.doc_size(sd.d_id); + float TF = 1 + fastapprox::fasterlog( 1 + fastapprox::fasterlog(sd.doc_term_count)); - double norm = (1 - s_) + s_ * (doc_len / sd.avg_dl); - double IDF + float norm = (1 - s_) + s_ * (doc_len / sd.avg_dl); + float IDF = fastapprox::fasterlog((sd.num_docs + 1) / (0.5 + sd.doc_count)); return TF / norm * sd.query_term_count * IDF; } diff --git a/src/index/ranker/ranker.cpp b/src/index/ranker/ranker.cpp index ba976c3b4..ee67e2a4e 100644 --- a/src/index/ranker/ranker.cpp +++ b/src/index/ranker/ranker.cpp @@ -18,7 +18,6 @@ namespace index namespace { - struct postings_context { using postings_data_type = inverted_index::postings_data_type; @@ -46,7 +45,7 @@ struct postings_context }; } -std::vector> ranker::score( +std::vector> ranker::score( inverted_index& idx, corpus::document& query, uint64_t num_results /* = 10 */, const std::function& filter /* return true */) @@ -57,11 +56,11 @@ std::vector> ranker::score( score_data sd{idx, idx.avg_doc_length(), idx.num_docs(), idx.total_corpus_terms(), query}; - std::vector> results; + std::vector> results; results.reserve(num_results + 1); // +1 since we use this as a heap and // prune when it exceeds size num_results - auto comp = [](const std::pair& a, - const std::pair& b) + auto comp = [](const std::pair& a, + const std::pair& b) { // comparison is reversed since we want a min-heap return a.second > b.second; @@ -98,7 +97,7 @@ std::vector> ranker::score( sd.doc_size = idx.doc_size(cur_doc); sd.doc_unique_terms = idx.unique_terms(cur_doc); - double score = initial_score(sd); + auto score = initial_score(sd); for (auto& pc : postings) { if (pc.begin == pc.end) @@ -152,7 +151,7 @@ std::vector> ranker::score( return results; } -double ranker::initial_score(const score_data&) const +float ranker::initial_score(const score_data&) const { return 0.0; } diff --git a/src/index/tools/interactive-search.cpp b/src/index/tools/interactive-search.cpp index 17aa910fb..7451355b7 100644 --- a/src/index/tools/interactive-search.cpp +++ b/src/index/tools/interactive-search.cpp @@ -78,7 +78,7 @@ int main(int 
argc, char* argv[]) query.content(text); // set the doc's content to be user input // Use the ranker to score the query over the index. - std::vector> ranking; + std::vector> ranking; auto time = common::time([&]() { ranking = ranker->score(*idx, query, 5); }); diff --git a/src/test/ir_eval_test.cpp b/src/test/ir_eval_test.cpp index b4a885eed..f8d6b5545 100644 --- a/src/test/ir_eval_test.cpp +++ b/src/test/ir_eval_test.cpp @@ -12,7 +12,7 @@ namespace testing { void check_query(index::ir_eval& eval, - const std::vector>& ranking, + const std::vector>& ranking, query_id qid, double e_f1, double e_p, double e_r, double e_avg_p, double e_ndcg, uint64_t num_docs = std::numeric_limits::max()) @@ -79,7 +79,7 @@ int ir_eval_results() ASSERT_APPROX_EQUAL(eval.gmap(), 0.0); // make some fake results based on the loaded qrels file - std::vector> results; + std::vector> results; query_id qid{0}; auto idcg_5 = 1.0 + 1.0 / std::log2(3.0) + 1.0 / std::log2(4.0) + 1.0 / std::log2(5.0) + 1.0 / std::log2(6.0); From 422cf72ea273593ebd88912102f536db4aabba5b Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Wed, 22 Apr 2015 11:24:00 -0500 Subject: [PATCH 105/481] correct variable name in absolute discount formula --- src/index/ranker/absolute_discount.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/index/ranker/absolute_discount.cpp b/src/index/ranker/absolute_discount.cpp index 2c4706dd8..c28c7b48b 100644 --- a/src/index/ranker/absolute_discount.cpp +++ b/src/index/ranker/absolute_discount.cpp @@ -38,8 +38,8 @@ template <> std::unique_ptr make_ranker(const cpptoml::table& config) { - if (auto gamma = config.get_as("gamma")) - return make_unique(*gamma); + if (auto delta = config.get_as("delta")) + return make_unique(*delta); return make_unique(); } } From 738292b07b6a5f14da2ce5e53c00f618b3f74a60 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Wed, 22 Apr 2015 12:04:19 -0500 Subject: [PATCH 106/481] replace std::pair with search_result object --- include/classify/classifier/knn.h | 2 +- include/index/eval/ir_eval.h | 3 +- include/index/ranker/ranker.h | 14 ++++++++- src/classify/classifier/knn.cpp | 10 +++---- src/index/eval/ir_eval.cpp | 39 +++++++++++++------------- src/index/ranker/ranker.cpp | 9 +++--- src/index/tools/interactive-search.cpp | 32 ++++++++++++--------- src/index/tools/query-runner.cpp | 4 +-- src/index/tools/search.cpp | 15 +++++----- src/test/ir_eval_test.cpp | 5 ++-- src/test/ranker_test.cpp | 6 ++-- 11 files changed, 80 insertions(+), 59 deletions(-) diff --git a/include/classify/classifier/knn.h b/include/classify/classifier/knn.h index 24012e0a8..650da01be 100644 --- a/include/classify/classifier/knn.h +++ b/include/classify/classifier/knn.h @@ -69,7 +69,7 @@ class knn : public classifier * @return the best label */ class_label select_best_label( - const std::vector>& scored, + const std::vector& scored, const std::vector>& sorted) const; /** the inverted index used for ranking */ diff --git a/include/index/eval/ir_eval.h b/include/index/eval/ir_eval.h index 85da1fc3a..71f98946c 100644 --- a/include/index/eval/ir_eval.h +++ b/include/index/eval/ir_eval.h @@ -16,6 +16,7 @@ #include #include #include +#include "index/ranker/ranker.h" #include "meta.h" namespace meta @@ -30,7 +31,7 @@ namespace index class ir_eval { public: - using result_type = std::vector>; + using result_type = std::vector; /** * @param config_file Path to cpptoml configuration file diff --git a/include/index/ranker/ranker.h b/include/index/ranker/ranker.h index faaabf235..b895c1936 
100644 --- a/include/index/ranker/ranker.h +++ b/include/index/ranker/ranker.h @@ -34,6 +34,18 @@ namespace meta namespace index { +/** + * A simple struct to hold scored document data. + */ +struct search_result +{ + search_result(doc_id id, float s) : d_id{id}, score{s} + { + } + doc_id d_id; + float score; +}; + /** * A ranker scores a query against all the documents in an inverted index, * returning a list of documents sorted by relevance. @@ -48,7 +60,7 @@ class ranker * @param filter A filtering function to apply to each doc_id; returns true * if the document should be included in results */ - std::vector> + std::vector score(inverted_index& idx, corpus::document& query, uint64_t num_results = 10, const std::function& filter = [](doc_id) diff --git a/src/classify/classifier/knn.cpp b/src/classify/classifier/knn.cpp index 7d8b40748..d1d707081 100644 --- a/src/classify/classifier/knn.cpp +++ b/src/classify/classifier/knn.cpp @@ -58,10 +58,10 @@ class_label knn::classify(doc_id d_id) // normally, weighted k-nn weights neighbors by 1/distance, but since // our scores are similarity scores, we weight by the similarity if (weighted_) - counts[idx_->label(s.first)] += s.second; + counts[idx_->label(s.d_id)] += s.score; // if not weighted, each neighbor gets an equal vote else - ++counts[idx_->label(s.first)]; + ++counts[idx_->label(s.d_id)]; } if (counts.empty()) @@ -78,7 +78,7 @@ class_label knn::classify(doc_id d_id) } class_label knn::select_best_label( - const std::vector>& scored, + const std::vector& scored, const std::vector>& sorted) const { uint16_t highest = sorted.begin()->second; @@ -97,9 +97,9 @@ class_label knn::select_best_label( // since there is a tie, return the class label that appeared first in the // rankings; this will usually only happen if the neighbor scores are not // weighted - for (auto& p : scored) + for (auto& result : scored) { - class_label lbl{inv_idx_->label(p.first)}; + class_label lbl{inv_idx_->label(result.d_id)}; auto f = best.find(lbl); if (f != best.end()) return *f; diff --git a/src/index/eval/ir_eval.cpp b/src/index/eval/ir_eval.cpp index c3a5856b1..9bf400826 100644 --- a/src/index/eval/ir_eval.cpp +++ b/src/index/eval/ir_eval.cpp @@ -48,7 +48,9 @@ void ir_eval::init_index(const std::string& path) { std::getline(in, line); bool trec = (std::count_if(line.begin(), line.end(), [](char ch) - { return ch == ' '; }) == 3); // 3 spaces == 4 columns + { + return ch == ' '; + }) == 3); // 3 spaces == 4 columns std::istringstream iss{line}; iss >> q_id; if (trec) @@ -61,7 +63,7 @@ void ir_eval::init_index(const std::string& path) } } -double ir_eval::precision(const std::vector>& results, +double ir_eval::precision(const std::vector& results, query_id q_id, uint64_t num_docs) const { if (results.empty()) @@ -75,8 +77,8 @@ double ir_eval::precision(const std::vector>& results, return relevant_retrieved(results, q_id, num_docs) / denominator; } -double ir_eval::recall(const std::vector>& results, - query_id q_id, uint64_t num_docs) const +double ir_eval::recall(const std::vector& results, query_id q_id, + uint64_t num_docs) const { if (results.empty()) return 0.0; @@ -88,16 +90,15 @@ double ir_eval::recall(const std::vector>& results, return relevant_retrieved(results, q_id, num_docs) / ht->second.size(); } -double ir_eval::relevant_retrieved(const std::vector - >& results, +double ir_eval::relevant_retrieved(const std::vector& results, query_id q_id, uint64_t num_docs) const { double rel = 0.0; const auto& ht = qrels_.find(q_id); uint64_t i = 1; - for (auto& 
res : results) + for (auto& result : results) { - if (map::safe_at(ht->second, res.first) != 0) + if (map::safe_at(ht->second, result.d_id) != 0) ++rel; if (i++ == num_docs) break; @@ -106,8 +107,8 @@ double ir_eval::relevant_retrieved(const std::vector return rel; } -double ir_eval::f1(const std::vector>& results, - query_id q_id, uint64_t num_docs, double beta) const +double ir_eval::f1(const std::vector& results, query_id q_id, + uint64_t num_docs, double beta) const { double p = precision(results, q_id, num_docs); double r = recall(results, q_id, num_docs); @@ -120,8 +121,8 @@ double ir_eval::f1(const std::vector>& results, return numerator / denominator; } -double ir_eval::ndcg(const std::vector>& results, - query_id q_id, uint64_t num_docs) const +double ir_eval::ndcg(const std::vector& results, query_id q_id, + uint64_t num_docs) const { // find this query's judgements const auto& ht = qrels_.find(q_id); @@ -131,9 +132,9 @@ double ir_eval::ndcg(const std::vector>& results, // calculate discounted cumulative gain double dcg = 0.0; uint64_t i = 1; - for (auto& res : results) + for (auto& result : results) { - auto rel = map::safe_at(ht->second, res.first); // 0 if non-relevant + auto rel = map::safe_at(ht->second, result.d_id); // 0 if non-relevant dcg += (std::pow(2.0, rel) - 1.0) / std::log2(i + 1.0); if (i++ == num_docs) break; @@ -157,8 +158,8 @@ double ir_eval::ndcg(const std::vector>& results, return dcg / idcg; } -double ir_eval::avg_p(const std::vector>& results, - query_id q_id, uint64_t num_docs) +double ir_eval::avg_p(const std::vector& results, query_id q_id, + uint64_t num_docs) { const auto& ht = qrels_.find(q_id); if (ht == qrels_.end() || results.empty()) @@ -173,9 +174,9 @@ double ir_eval::avg_p(const std::vector>& results, uint64_t i = 1; double avgp = 0.0; double num_rel = 1; - for (auto& res : results) + for (auto& result : results) { - if (map::safe_at(ht->second, res.first) != 0) + if (map::safe_at(ht->second, result.d_id) != 0) { avgp += num_rel / i; ++num_rel; @@ -211,7 +212,7 @@ double ir_eval::gmap() const return std::exp(sum / scores_.size()); } -void ir_eval::print_stats(const std::vector>& results, +void ir_eval::print_stats(const std::vector& results, query_id q_id, std::ostream& out) { auto w1 = std::setw(8); diff --git a/src/index/ranker/ranker.cpp b/src/index/ranker/ranker.cpp index ee67e2a4e..c29654bfe 100644 --- a/src/index/ranker/ranker.cpp +++ b/src/index/ranker/ranker.cpp @@ -45,7 +45,7 @@ struct postings_context }; } -std::vector> ranker::score( +std::vector ranker::score( inverted_index& idx, corpus::document& query, uint64_t num_results /* = 10 */, const std::function& filter /* return true */) @@ -56,14 +56,13 @@ std::vector> ranker::score( score_data sd{idx, idx.avg_doc_length(), idx.num_docs(), idx.total_corpus_terms(), query}; - std::vector> results; + std::vector results; results.reserve(num_results + 1); // +1 since we use this as a heap and // prune when it exceeds size num_results - auto comp = [](const std::pair& a, - const std::pair& b) + auto comp = [](const search_result& a, const search_result& b) { // comparison is reversed since we want a min-heap - return a.second > b.second; + return a.score > b.score; }; std::vector postings; diff --git a/src/index/tools/interactive-search.cpp b/src/index/tools/interactive-search.cpp index 7451355b7..8317cfd25 100644 --- a/src/index/tools/interactive-search.cpp +++ b/src/index/tools/interactive-search.cpp @@ -60,10 +60,11 @@ int main(int argc, char* argv[]) auto ranker = 
index::make_ranker(*group); // Find the path prefix to each document so we can print out the contents. - std::string prefix = *config.get_as("prefix") - + "/" + *config.get_as("dataset") + "/"; + std::string prefix = *config.get_as("prefix") + "/" + + *config.get_as("dataset") + "/"; - std::cout << "Enter a query, or blank to quit." << std::endl << std::endl; + std::cout << "Enter a query, or blank to quit." << std::endl + << std::endl; std::string text; while (true) @@ -78,21 +79,26 @@ int main(int argc, char* argv[]) query.content(text); // set the doc's content to be user input // Use the ranker to score the query over the index. - std::vector> ranking; + std::vector ranking; auto time = common::time([&]() - { ranking = ranker->score(*idx, query, 5); }); + { + ranking = ranker->score(*idx, query, 5); + }); - std::cout << "Showing top 5 of results (" << time.count() << "ms)" + std::cout << "Showing top 5 results (" << time.count() << "ms)" << std::endl; - for (size_t i = 0; i < ranking.size() && i < 5; ++i) + uint64_t result_num = 1; + for (auto& result : ranking) { - std::string path{idx->doc_path(ranking[i].first)}; - std::cout << printing::make_bold(std::to_string(i + 1) + ". " + path - + " (" - + std::to_string(ranking[i].second) - + ")") << std::endl; - std::cout << get_content(prefix + path) << std::endl << std::endl; + std::string path{idx->doc_path(result.d_id)}; + std::cout << printing::make_bold( + std::to_string(result_num) + ". " + path + " (" + + std::to_string(result.score) + ")") << std::endl; + std::cout << get_content(prefix + path) << std::endl + << std::endl; + if (result_num++ == 5) + break; } std::cout << std::endl; diff --git a/src/index/tools/query-runner.cpp b/src/index/tools/query-runner.cpp index aa4fbb836..7b9ac5c2b 100644 --- a/src/index/tools/query-runner.cpp +++ b/src/index/tools/query-runner.cpp @@ -83,8 +83,8 @@ int main(int argc, char* argv[]) for (auto& result : ranking) { std::cout << result_num << ". " - << idx->doc_name(result.first) << " " - << result.second << std::endl; + << idx->doc_name(result.d_id) << " " + << result.score << std::endl; if (result_num++ == 10) break; } diff --git a/src/index/tools/search.cpp b/src/index/tools/search.cpp index 70f9854fd..8fcb8ad04 100644 --- a/src/index/tools/search.cpp +++ b/src/index/tools/search.cpp @@ -82,15 +82,16 @@ int main(int argc, char* argv[]) // of // 10" docs. auto ranking = ranker->score(*idx, query); - std::cout << "Showing top 10 of " << ranking.size() - << " results." << std::endl; + std::cout << "Showing top 10 results." << std::endl; - // Print out the top ten results. - for (size_t i = 0; i < ranking.size() && i < 10; ++i) + uint64_t result_num = 1; + for (auto& result : ranking) { - std::cout << (i + 1) << ". " - << idx->doc_name(ranking[i].first) << " " - << ranking[i].second << std::endl; + std::cout << result_num << ". 
" + << idx->doc_name(result.d_id) << " " + << result.score << std::endl; + if (result_num++ == 10) + break; } std::cout << std::endl; diff --git a/src/test/ir_eval_test.cpp b/src/test/ir_eval_test.cpp index f8d6b5545..61f489fc3 100644 --- a/src/test/ir_eval_test.cpp +++ b/src/test/ir_eval_test.cpp @@ -4,6 +4,7 @@ */ #include "test/ir_eval_test.h" +#include "index/ranker/ranker.h" #include "corpus/document.h" namespace meta @@ -12,7 +13,7 @@ namespace testing { void check_query(index::ir_eval& eval, - const std::vector>& ranking, + const std::vector& ranking, query_id qid, double e_f1, double e_p, double e_r, double e_avg_p, double e_ndcg, uint64_t num_docs = std::numeric_limits::max()) @@ -79,7 +80,7 @@ int ir_eval_results() ASSERT_APPROX_EQUAL(eval.gmap(), 0.0); // make some fake results based on the loaded qrels file - std::vector> results; + std::vector results; query_id qid{0}; auto idcg_5 = 1.0 + 1.0 / std::log2(3.0) + 1.0 / std::log2(4.0) + 1.0 / std::log2(5.0) + 1.0 / std::log2(6.0); diff --git a/src/test/ranker_test.cpp b/src/test/ranker_test.cpp index df43a8055..89fa221c4 100644 --- a/src/test/ranker_test.cpp +++ b/src/test/ranker_test.cpp @@ -27,10 +27,10 @@ void test_rank(Ranker& r, Index& idx, const std::string& encoding) // since we're searching for a document already in the index, the same // document should be ranked first, but there are a few duplicate // documents...... - if (ranking[0].first != i) + if (ranking[0].d_id != i) { - ASSERT_EQUAL(ranking[1].first, i); - ASSERT_APPROX_EQUAL(ranking[0].second, ranking[1].second); + ASSERT_EQUAL(ranking[1].d_id, i); + ASSERT_APPROX_EQUAL(ranking[0].score, ranking[1].score); } } } From bcea8abf7a372f2fa389ac2758425823a5d565a7 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Wed, 22 Apr 2015 21:19:52 -0500 Subject: [PATCH 107/481] downconvert all default doubles to float in rankers --- include/index/ranker/absolute_discount.h | 2 +- include/index/ranker/dirichlet_prior.h | 2 +- include/index/ranker/jelinek_mercer.h | 2 +- include/index/ranker/okapi_bm25.h | 6 +++--- include/index/ranker/pivoted_length.h | 2 +- src/index/ranker/absolute_discount.cpp | 2 +- src/index/ranker/jelinek_mercer.cpp | 2 +- src/index/ranker/okapi_bm25.cpp | 8 ++++---- src/index/ranker/pivoted_length.cpp | 8 ++++---- 9 files changed, 17 insertions(+), 17 deletions(-) diff --git a/include/index/ranker/absolute_discount.h b/include/index/ranker/absolute_discount.h index 73781e013..007020e5a 100644 --- a/include/index/ranker/absolute_discount.h +++ b/include/index/ranker/absolute_discount.h @@ -32,7 +32,7 @@ class absolute_discount : public language_model_ranker /** * @param delta */ - absolute_discount(float delta = 0.7); + absolute_discount(float delta = 0.7f); /** * Calculates the smoothed probability of a term. 
diff --git a/include/index/ranker/dirichlet_prior.h b/include/index/ranker/dirichlet_prior.h index e5b496b6f..c08291ad3 100644 --- a/include/index/ranker/dirichlet_prior.h +++ b/include/index/ranker/dirichlet_prior.h @@ -27,7 +27,7 @@ class dirichlet_prior : public language_model_ranker const static std::string id; /// Default value of mu - const static constexpr float default_mu = 2000; + const static constexpr float default_mu = 2000.0f; /** * @param mu diff --git a/include/index/ranker/jelinek_mercer.h b/include/index/ranker/jelinek_mercer.h index f1dcf32c3..c0339e9a4 100644 --- a/include/index/ranker/jelinek_mercer.h +++ b/include/index/ranker/jelinek_mercer.h @@ -30,7 +30,7 @@ class jelinek_mercer : public language_model_ranker const static std::string id; /// Default value of lambda - const static constexpr float default_lambda = 0.7; + const static constexpr float default_lambda = 0.7f; /** * @param lambda diff --git a/include/index/ranker/okapi_bm25.h b/include/index/ranker/okapi_bm25.h index ebbf20502..b1542931d 100644 --- a/include/index/ranker/okapi_bm25.h +++ b/include/index/ranker/okapi_bm25.h @@ -27,13 +27,13 @@ class okapi_bm25 : public ranker const static std::string id; /// Default k1, doc term smoothing - const static constexpr float default_k1 = 1.2; + const static constexpr float default_k1 = 1.2f; /// Default b, length normalization - const static constexpr float default_b = 0.75; + const static constexpr float default_b = 0.75f; /// Default k3, query term smoothing - const static constexpr float default_k3 = 500.0; + const static constexpr float default_k3 = 500.0f; /** * @param k1 Doc term smoothing diff --git a/include/index/ranker/pivoted_length.h b/include/index/ranker/pivoted_length.h index 5c6a194a5..9e1930020 100644 --- a/include/index/ranker/pivoted_length.h +++ b/include/index/ranker/pivoted_length.h @@ -29,7 +29,7 @@ class pivoted_length : public ranker const static std::string id; /// Default value of s parameter - const static constexpr float default_s = 0.20; + const static constexpr float default_s = 0.2f; /** * @param s diff --git a/src/index/ranker/absolute_discount.cpp b/src/index/ranker/absolute_discount.cpp index c28c7b48b..15144603b 100644 --- a/src/index/ranker/absolute_discount.cpp +++ b/src/index/ranker/absolute_discount.cpp @@ -23,7 +23,7 @@ absolute_discount::absolute_discount(float delta) : delta_{delta} float absolute_discount::smoothed_prob(const score_data& sd) const { float pc = static_cast(sd.corpus_term_count) / sd.total_terms; - float numerator = std::max(sd.doc_term_count - delta_, 0); + float numerator = std::max(sd.doc_term_count - delta_, 0.0f); float denominator = sd.doc_size; return numerator / denominator + doc_constant(sd) * pc; } diff --git a/src/index/ranker/jelinek_mercer.cpp b/src/index/ranker/jelinek_mercer.cpp index a5e1c534a..6d34c2568 100644 --- a/src/index/ranker/jelinek_mercer.cpp +++ b/src/index/ranker/jelinek_mercer.cpp @@ -24,7 +24,7 @@ float jelinek_mercer::smoothed_prob(const score_data& sd) const float max_likelihood = static_cast(sd.doc_term_count) / sd.doc_size; float pc = static_cast(sd.corpus_term_count) / sd.total_terms; - return (1.0 - lambda_) * max_likelihood + lambda_ * pc; + return (1.0f - lambda_) * max_likelihood + lambda_ * pc; } float jelinek_mercer::doc_constant(const score_data&) const diff --git a/src/index/ranker/okapi_bm25.cpp b/src/index/ranker/okapi_bm25.cpp index 331a6ce91..499c2095d 100644 --- a/src/index/ranker/okapi_bm25.cpp +++ b/src/index/ranker/okapi_bm25.cpp @@ -27,13 +27,13 @@ float 
okapi_bm25::score_one(const score_data& sd) // add 1.0 to the IDF to ensure that the result is positive float IDF = fastapprox::fasterlog( - 1.0 + (sd.num_docs - sd.doc_count + 0.5) / (sd.doc_count + 0.5)); + 1.0f + (sd.num_docs - sd.doc_count + 0.5f) / (sd.doc_count + 0.5f)); - float TF = ((k1_ + 1.0) * sd.doc_term_count) - / ((k1_ * ((1.0 - b_) + b_ * doc_len / sd.avg_dl)) + float TF = ((k1_ + 1.0f) * sd.doc_term_count) + / ((k1_ * ((1.0f - b_) + b_ * doc_len / sd.avg_dl)) + sd.doc_term_count); - float QTF = ((k3_ + 1.0) * sd.query_term_count) + float QTF = ((k3_ + 1.0f) * sd.query_term_count) / (k3_ + sd.query_term_count); return TF * IDF * QTF; diff --git a/src/index/ranker/pivoted_length.cpp b/src/index/ranker/pivoted_length.cpp index 8c9054492..d355cedf8 100644 --- a/src/index/ranker/pivoted_length.cpp +++ b/src/index/ranker/pivoted_length.cpp @@ -23,11 +23,11 @@ pivoted_length::pivoted_length(float s) : s_{s} float pivoted_length::score_one(const score_data& sd) { float doc_len = sd.idx.doc_size(sd.d_id); - float TF = 1 + fastapprox::fasterlog( - 1 + fastapprox::fasterlog(sd.doc_term_count)); - float norm = (1 - s_) + s_ * (doc_len / sd.avg_dl); + float TF = 1.0f + fastapprox::fasterlog( + 1.0f + fastapprox::fasterlog(sd.doc_term_count)); + float norm = (1.0f - s_) + s_ * (doc_len / sd.avg_dl); float IDF - = fastapprox::fasterlog((sd.num_docs + 1) / (0.5 + sd.doc_count)); + = fastapprox::fasterlog((sd.num_docs + 1.0f) / (0.5f + sd.doc_count)); return TF / norm * sd.query_term_count * IDF; } From dce7a6cb32669e0bf67df40cd70d28172f7a6963 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Wed, 22 Apr 2015 21:22:20 -0500 Subject: [PATCH 108/481] use fastlog instead of fasterlog (no performance difference) --- src/index/ranker/lm_ranker.cpp | 4 ++-- src/index/ranker/okapi_bm25.cpp | 2 +- src/index/ranker/pivoted_length.cpp | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/index/ranker/lm_ranker.cpp b/src/index/ranker/lm_ranker.cpp index 8f09f5ac2..84fada862 100644 --- a/src/index/ranker/lm_ranker.cpp +++ b/src/index/ranker/lm_ranker.cpp @@ -21,12 +21,12 @@ float language_model_ranker::score_one(const score_data& sd) float ps = smoothed_prob(sd); float pc = static_cast(sd.corpus_term_count) / sd.total_terms; return sd.query_term_count - * fastapprox::fasterlog(ps / (doc_constant(sd) * pc)); + * fastapprox::fastlog(ps / (doc_constant(sd) * pc)); } float language_model_ranker::initial_score(const score_data& sd) const { - return sd.query.length() * fastapprox::fasterlog(doc_constant(sd)); + return sd.query.length() * fastapprox::fastlog(doc_constant(sd)); } } } diff --git a/src/index/ranker/okapi_bm25.cpp b/src/index/ranker/okapi_bm25.cpp index 499c2095d..9fc9cc757 100644 --- a/src/index/ranker/okapi_bm25.cpp +++ b/src/index/ranker/okapi_bm25.cpp @@ -26,7 +26,7 @@ float okapi_bm25::score_one(const score_data& sd) float doc_len = sd.idx.doc_size(sd.d_id); // add 1.0 to the IDF to ensure that the result is positive - float IDF = fastapprox::fasterlog( + float IDF = fastapprox::fastlog( 1.0f + (sd.num_docs - sd.doc_count + 0.5f) / (sd.doc_count + 0.5f)); float TF = ((k1_ + 1.0f) * sd.doc_term_count) diff --git a/src/index/ranker/pivoted_length.cpp b/src/index/ranker/pivoted_length.cpp index d355cedf8..01947e760 100644 --- a/src/index/ranker/pivoted_length.cpp +++ b/src/index/ranker/pivoted_length.cpp @@ -23,11 +23,11 @@ pivoted_length::pivoted_length(float s) : s_{s} float pivoted_length::score_one(const score_data& sd) { float doc_len = sd.idx.doc_size(sd.d_id); - 
float TF = 1.0f + fastapprox::fasterlog(
-        1.0f + fastapprox::fasterlog(sd.doc_term_count));
+    float TF = 1.0f + fastapprox::fastlog(
+        1.0f + fastapprox::fastlog(sd.doc_term_count));
     float norm = (1.0f - s_) + s_ * (doc_len / sd.avg_dl);
     float IDF
-        = fastapprox::fasterlog((sd.num_docs + 1.0f) / (0.5f + sd.doc_count));
+        = fastapprox::fastlog((sd.num_docs + 1.0f) / (0.5f + sd.doc_count));

     return TF / norm * sd.query_term_count * IDF;
 }

From 751ec7422c03eff9a5a874e4aace26a8bb9c9aa2 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Fri, 24 Apr 2015 00:35:47 -0500
Subject: [PATCH 109/481] Fix bug with forward_index creation via uninverting.

The issue occurred when the inverted index had blank documents: empty
postings data entries were not being created in the uninverted forward
index in this case.

This change requires that we now always write out the primary key along
with the postings data entry to intermediate chunks so we can check it
when uninverting.
---
 include/index/postings_data.tcc | 12 +++---------
 src/index/forward_index.cpp     | 14 +++++++++++++-
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/include/index/postings_data.tcc b/include/index/postings_data.tcc
index 01807ee03..e7071d47a 100644
--- a/include/index/postings_data.tcc
+++ b/include/index/postings_data.tcc
@@ -125,9 +125,7 @@ uint64_t postings_data::write_packed(
 {
     uint64_t bytes = 0;

-    if (std::is_same::value)
-        bytes += io::write_binary(out, p_id_);
-
+    bytes += io::write_binary(out, p_id_);
     bytes += write_packed_counts(out);

     return bytes;
@@ -196,12 +194,8 @@ uint64_t postings_data::read_packed(std::istream& in)
     else
         in.unget();

-    uint64_t bytes = 0;
-    if (std::is_same::value)
-    {
-        io::read_binary(in, p_id_);
-        bytes += length(p_id_);
-    }
+    io::read_binary(in, p_id_);
+    auto bytes = length(p_id_);

     uint64_t size;
     uint64_t total_counts;
diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp
index bf4df97cb..890cbb7be 100644
--- a/src/index/forward_index.cpp
+++ b/src/index/forward_index.cpp
@@ -348,12 +348,24 @@ void forward_index::impl::compress(const std::string& filename,
     printing::progress progress{
         " > Compressing postings: ", length, 500, 1024 /* 1KB */
     };

-    // note: we will be accessing pdata in sorted order
+    // note: we will be accessing pdata in sorted order, but not every
+    // doc_id is guaranteed to exist, so we must be mindful of document
+    // gaps
+    doc_id last_id{0};
     while (auto bytes = pdata.read_packed(in))
     {
         byte_pos += bytes;
         progress(byte_pos);
+
+        // write out any gaps
+        for (doc_id d_id{last_id + 1}; d_id < pdata.primary_key(); ++d_id)
+        {
+            forward_index::postings_data_type pd{d_id};
+            out.write(pd);
+        }
+
         out.write(pdata);
+        last_id = pdata.primary_key();
     }
 }

From f616fe8dc34d06c32499b7724c97df04d19ad13f Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Mon, 27 Apr 2015 00:51:22 -0500
Subject: [PATCH 110/481] Add save/load functionality to naive_bayes classifier.
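A usage sketch of the new persistence API (method names and on-disk file names
are taken from the hunks below; the forward_index f_idx and the train/test
document vectors are assumed to be set up as in the classifier tests):

    // train once, write the model out, then restore it into a fresh classifier
    classify::naive_bayes nb{f_idx};
    nb.train(train_docs);
    filesystem::make_directory("naive-bayes-model");
    nb.save("naive-bayes-model");  // writes nb-term-probs(.gz) and nb-class-probs(.gz)

    classify::naive_bayes nb2{f_idx};
    nb2.load("naive-bayes-model"); // throws naive_bayes::exception if a file is missing
    auto matrix = nb2.test(test_docs);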
--- include/classify/classifier/naive_bayes.h | 19 +- include/index/postings_data.h | 7 +- include/stats/dirichlet.h | 12 ++ include/stats/dirichlet.tcc | 112 +++++++++++- include/stats/multinomial.h | 12 ++ include/stats/multinomial.tcc | 72 ++++++++ include/test/classifier_test.h | 3 +- include/util/identifiers.h | 10 + src/classify/classifier/naive_bayes.cpp | 59 ++++++ src/test/classifier_test.cpp | 211 ++++++++++++---------- 10 files changed, 415 insertions(+), 102 deletions(-) diff --git a/include/classify/classifier/naive_bayes.h b/include/classify/classifier/naive_bayes.h index c31fb21f2..558809450 100644 --- a/include/classify/classifier/naive_bayes.h +++ b/include/classify/classifier/naive_bayes.h @@ -65,11 +65,29 @@ class naive_bayes : public classifier */ void reset() override; + /** + * Saves the model to a directory. + * @param prefix The directory to save the model files in + */ + void save(const std::string& prefix) const; + + /** + * Loads a model from a directory. + * @param prefix The directory to load the model from + */ + void load(const std::string& prefix); + /** * The identifier for this classifier. */ const static std::string id; + class exception : public std::runtime_error + { + public: + using std::runtime_error::runtime_error; + }; + private: /** * Contains P(term|class) for each class. @@ -90,7 +108,6 @@ template <> std::unique_ptr make_classifier(const cpptoml::table& config, std::shared_ptr idx); - } } #endif diff --git a/include/index/postings_data.h b/include/index/postings_data.h index 861913f33..a492a8f28 100644 --- a/include/index/postings_data.h +++ b/include/index/postings_data.h @@ -47,12 +47,9 @@ class postings_data * only be integral types. */ static_assert( - (std::is_integral::value - || std::is_base_of::value + (util::is_numeric::value || std::is_same::value) - && - (std::is_integral::value - || std::is_base_of::value), + && (util::is_numeric::value), "primary and secondary keys in postings data must be numeric types"); /** diff --git a/include/stats/dirichlet.h b/include/stats/dirichlet.h index ec49b9029..d3c482cfd 100644 --- a/include/stats/dirichlet.h +++ b/include/stats/dirichlet.h @@ -88,6 +88,18 @@ class dirichlet */ void swap(dirichlet& other); + /** + * Writes the dirichlet to a stream. + * @param out The stream to write to + */ + void save(std::ostream& out) const; + + /** + * Reads the dirichlet from a stream. 
+ * @param in The stream to read from + */ + void load(std::istream& in); + private: enum class type diff --git a/include/stats/dirichlet.tcc b/include/stats/dirichlet.tcc index df7a767d6..b3bf28301 100644 --- a/include/stats/dirichlet.tcc +++ b/include/stats/dirichlet.tcc @@ -6,6 +6,8 @@ #include "stats/dirichlet.h" #include "util/identifiers.h" #include "util/shim.h" +#include "io/binary.h" +#include "io/packed.h" namespace meta { @@ -22,14 +24,14 @@ dirichlet::dirichlet(double alpha, uint64_t n) template template dirichlet::dirichlet(Iter begin, Iter end) - : type_{type::SPARSE_ASYMMETRIC}, params_{begin, end} + : type_{type::ASYMMETRIC}, params_{begin, end} { using pair_type = typename Iter::value_type; alpha_sum_ - = std::accumulate(begin, end, [](double accum, const pair_type& b) + = std::accumulate(begin, end, 0.0, [](double accum, const pair_type& b) { - return accum + b.second; - }); + return accum + b.second; + }); } template @@ -140,5 +142,107 @@ void dirichlet::swap(dirichlet& other) } std::swap(alpha_sum_, other.alpha_sum_); } + +namespace dirichlet_detail +{ +template +struct is_packable +{ + const static constexpr bool value + = util::is_numeric::value || std::is_floating_point::value; +}; + +template +typename std::enable_if::value>::type + write(std::ostream& out, const T& elem) +{ + io::packed::write(out, elem); +} + +inline void write(std::ostream& out, const std::string& elem) +{ + io::write_binary(out, elem); +} + +template +typename std::enable_if::value>::type + read(std::istream& in, T& elem) +{ + io::packed::read(in, elem); +} + +inline void read(std::istream& in, std::string& elem) +{ + io::read_binary(in, elem); +} +} + +template +void dirichlet::save(std::ostream& out) const +{ + using namespace dirichlet_detail; + write(out, static_cast(type_)); + switch (type_) + { + case type::SYMMETRIC: + { + write(out, params_.fixed_alpha_); + write(out, + static_cast(alpha_sum_ / params_.fixed_alpha_)); + break; + } + case type::ASYMMETRIC: + { + write(out, params_.sparse_alpha_.size()); + for (const auto& alpha : params_.sparse_alpha_) + { + write(out, alpha.first); + write(out, alpha.second); + } + break; + } + } +} + +template +void dirichlet::load(std::istream& in) +{ + using namespace dirichlet_detail; + uint64_t typ; + auto bytes = io::packed::read(in, typ); + if (bytes == 0) + return; + + type read_type = static_cast(typ); + switch (read_type) + { + case type::SYMMETRIC: + { + double alpha; + io::packed::read(in, alpha); + uint64_t n; + io::packed::read(in, n); + *this = dirichlet{alpha, n}; + break; + } + case type::ASYMMETRIC: + { + uint64_t size; + io::packed::read(in, size); + std::vector> vec; + vec.reserve(size); + for (uint64_t i = 0; i < size; ++i) + { + T event; + read(in, event); + double count; + read(in, count); + vec.emplace_back(std::move(event), count); + } + *this = dirichlet{vec.begin(), vec.end()}; + break; + } + } +} } } diff --git a/include/stats/multinomial.h b/include/stats/multinomial.h index 520ff92d6..59a28d356 100644 --- a/include/stats/multinomial.h +++ b/include/stats/multinomial.h @@ -115,6 +115,18 @@ class multinomial */ multinomial& operator+=(const multinomial& other); + /** + * Saves the distribution to a stream. + * @param out The stream to write to + */ + void save(std::ostream& out) const; + + /** + * Reads the distribution from a stream. 
+ * @param in The stream to read from + */ + void load(std::istream& in); + private: util::sparse_vector counts_; double total_counts_; diff --git a/include/stats/multinomial.tcc b/include/stats/multinomial.tcc index 722a5b40e..d61be95d7 100644 --- a/include/stats/multinomial.tcc +++ b/include/stats/multinomial.tcc @@ -7,6 +7,8 @@ #include #include "stats/multinomial.h" #include "util/identifiers.h" +#include "io/binary.h" +#include "io/packed.h" namespace meta { @@ -104,5 +106,75 @@ multinomial& multinomial::operator+=(const multinomial& rhs) return *this; } +namespace multi_detail +{ +template +struct is_packable +{ + const static constexpr bool value + = util::is_numeric::value || std::is_floating_point::value; +}; + +template +typename std::enable_if::value>::type + write(std::ostream& out, const T& elem) +{ + io::packed::write(out, elem); +} + +inline void write(std::ostream& out, const std::string& elem) +{ + io::write_binary(out, elem); +} + +template +typename std::enable_if::value>::type + read(std::istream& in, T& elem) +{ + io::packed::read(in, elem); +} + +inline void read(std::istream& in, std::string& elem) +{ + io::read_binary(in, elem); +} +} + +template +void multinomial::save(std::ostream& out) const +{ + using namespace multi_detail; + write(out, total_counts_); + write(out, counts_.size()); + for (const auto& count : counts_) + { + write(out, count.first); + write(out, count.second); + } + prior_.save(out); +} + +template +void multinomial::load(std::istream& in) +{ + using namespace multi_detail; + clear(); + double total_counts; + auto bytes = io::packed::read(in, total_counts); + uint64_t size; + bytes += io::packed::read(in, size); + if (bytes == 0) + return; + + total_counts_ = total_counts; + counts_.reserve(size); + for (uint64_t i = 0; i < size; ++i) + { + T event; + read(in, event); + read(in, counts_[event]); + } + prior_.load(in); +} } } diff --git a/include/test/classifier_test.h b/include/test/classifier_test.h index d963b377f..64252ae19 100644 --- a/include/test/classifier_test.h +++ b/include/test/classifier_test.h @@ -37,9 +37,10 @@ void check_cv(Index& idx, Classifier& c, double min_accuracy); * @param idx The index to run the classifier on * @param c The classifier to test * @param min_accuracy The mininum acceptable accuracy + * @param train Whether or not to train the model */ template -void check_split(Index& idx, Classifier& c, double min_accuracy); +void check_split(Index& idx, Classifier& c, double min_accuracy, bool train = true); /** * Runs the classifier tests. diff --git a/include/util/identifiers.h b/include/util/identifiers.h index 6705b8530..fd058fd6c 100644 --- a/include/util/identifiers.h +++ b/include/util/identifiers.h @@ -27,6 +27,16 @@ struct numeric { }; +/** + * Type trait for numeric. + */ +template +struct is_numeric +{ + const static constexpr bool value + = std::is_integral::value || std::is_base_of::value; +}; + /** * Helper class that allows the wrapped type to be hashed into standard * library containers such as unordered_map or unordered_set. 
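The new trait in action, as compile-time checks (a sketch; doc_id stands in for
any of the identifier wrappers assumed to derive from util::numeric, which is
exactly what the relaxed postings_data static_assert above now accepts):

    static_assert(meta::util::is_numeric<uint64_t>::value,
                  "built-in integral types satisfy the trait");
    static_assert(meta::util::is_numeric<doc_id>::value,
                  "identifier wrappers derive from util::numeric");
    static_assert(!meta::util::is_numeric<std::string>::value,
                  "non-numeric key types are still rejected");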
diff --git a/src/classify/classifier/naive_bayes.cpp b/src/classify/classifier/naive_bayes.cpp index 36066f681..73ec1b69e 100644 --- a/src/classify/classifier/naive_bayes.cpp +++ b/src/classify/classifier/naive_bayes.cpp @@ -8,6 +8,11 @@ #include "cpptoml.h" #include "classify/classifier/naive_bayes.h" #include "index/postings_data.h" +#if META_HAS_ZLIB +#include "io/gzstream.h" +#endif +#include "io/binary.h" +#include "io/packed.h" namespace meta { @@ -82,6 +87,60 @@ class_label naive_bayes::classify(doc_id d_id) return label; } +void naive_bayes::save(const std::string& prefix) const +{ +#if META_HAS_ZLIB + io::gzofstream tp_out{prefix + "/nb-term-probs.gz"}; + io::gzofstream cp_out{prefix + "/nb-class-probs.gz"}; +#else + std::ofstream tp_out{prefix + "/nb-term-probs", std::ios::binary}; + std::ofstream cp_out{prefix + "/nb-class-probs", std::ios::binary}; +#endif + + io::packed::write(tp_out, term_probs_.size()); + for (const auto& dist : term_probs_) + { + const auto& label = dist.first; + const auto& probs = dist.second; + io::write_binary(tp_out, static_cast(label)); + probs.save(tp_out); + } + class_probs_.save(cp_out); +} + +void naive_bayes::load(const std::string& prefix) +{ +#if META_HAS_ZLIB + io::gzifstream tp_in{prefix + "/nb-term-probs.gz"}; + io::gzifstream cp_in{prefix + "/nb-class-probs.gz"}; +#else + std::ifstream tp_in{prefix + "/nb-term-probs", std::ios::binary}; + std::ifstream cp_in{prefix + "/nb-class-probs", std::ios::binary}; +#endif + + if (!tp_in) + throw exception{"term probability file not found at prefix " + prefix}; + + if (!cp_in) + throw exception{"class probability file not found at prefix " + prefix}; + + uint64_t size; + auto bytes = io::packed::read(tp_in, size); + if (bytes == 0) + throw exception{ + "failed reading term probability file (no size written)"}; + + term_probs_.clear(); + term_probs_.reserve(size); + for (uint64_t i = 0; i < size; ++i) + { + std::string label; + io::read_binary(tp_in, label); + term_probs_[class_label{label}].load(tp_in); + } + class_probs_.load(cp_in); +} + template <> std::unique_ptr make_classifier(const cpptoml::table& config, diff --git a/src/test/classifier_test.cpp b/src/test/classifier_test.cpp index f9457ce95..4568c02cf 100644 --- a/src/test/classifier_test.cpp +++ b/src/test/classifier_test.cpp @@ -23,7 +23,7 @@ void check_cv(Index& idx, Classifier& c, double min_accuracy) } template -void check_split(Index& idx, Classifier& c, double min_accuracy) +void check_split(Index& idx, Classifier& c, double min_accuracy, bool train) { // create splits std::vector docs = idx.docs(); @@ -34,7 +34,8 @@ void check_split(Index& idx, Classifier& c, double min_accuracy) std::vector test_docs{docs.begin(), docs.begin() + split_idx}; // train and test - c.train(train_docs); + if (train) + c.train(train_docs); classify::confusion_matrix mtx = c.test(test_docs); ASSERT_GREATER(mtx.accuracy(), min_accuracy); ASSERT_LESS(mtx.accuracy(), 100.0); @@ -58,110 +59,138 @@ int run_tests(const std::string& type) num_failed += testing::run_test("naive-bayes-cv-" + type, [&]() { - naive_bayes nb{f_idx}; - check_cv(*f_idx, nb, 0.84); - }); + naive_bayes nb{f_idx}; + check_cv(*f_idx, nb, 0.84); + }); num_failed += testing::run_test("naive-bayes-split-" + type, [&]() { - naive_bayes nb{f_idx}; - check_split(*f_idx, nb, 0.83); - }); - - num_failed += testing::run_test("knn-cv-" + type, [&]() - { - knn kn{i_idx, f_idx, 10, make_unique()}; - check_cv(*f_idx, kn, 0.90); - }); - - num_failed += testing::run_test("knn-split-" + type, [&]() - { - knn 
kn{i_idx, f_idx, 10, make_unique()}; - check_split(*f_idx, kn, 0.88); - }); + naive_bayes nb{f_idx}; + check_split(*f_idx, nb, 0.83); + }); + + num_failed += testing::run_test( + "naive-bayes-save-load-" + type, [&]() + { + { + naive_bayes nb{f_idx}; + check_split(*f_idx, nb, 0.83); + filesystem::make_directory("naive-bayes-test"); + nb.save("naive-bayes-test"); + } + naive_bayes nb{f_idx}; + nb.load("naive-bayes-test"); + check_split(*f_idx, nb, 0.83, false); + }); + + system("rm -rf naive-bayes-test"); + + num_failed += testing::run_test( + "knn-cv-" + type, [&]() + { + knn kn{i_idx, f_idx, 10, make_unique()}; + check_cv(*f_idx, kn, 0.90); + }); + + num_failed += testing::run_test( + "knn-split-" + type, [&]() + { + knn kn{i_idx, f_idx, 10, make_unique()}; + check_split(*f_idx, kn, 0.88); + }); num_failed += testing::run_test("nearest-centroid-cv-" + type, [&]() { - nearest_centroid nc{i_idx, f_idx}; - check_cv(*f_idx, nc, 0.88); - }); + nearest_centroid nc{i_idx, f_idx}; + check_cv(*f_idx, nc, 0.88); + }); num_failed += testing::run_test("nearest-centroid-split-" + type, [&]() { - nearest_centroid nc{i_idx, f_idx}; - check_split(*f_idx, nc, 0.84); - }); - - num_failed += testing::run_test("sgd-cv-" + type, [&]() - { - one_vs_all hinge_sgd{f_idx, [&](class_label positive) - { - return make_unique("sgd-model-test", f_idx, positive, - class_label{"negative"}, - make_unique()); - }}; - check_cv(*f_idx, hinge_sgd, 0.93); - one_vs_all perceptron{f_idx, [&](class_label positive) - { - return make_unique("sgd-model-test", f_idx, positive, - class_label{"negative"}, - make_unique()); - }}; - check_cv(*f_idx, perceptron, 0.89); - }); - - num_failed += testing::run_test("sgd-split-" + type, [&]() - { - one_vs_all hinge_sgd{f_idx, [&](class_label positive) - { - return make_unique("sgd-model-test", f_idx, positive, - class_label{"negative"}, - make_unique()); - }}; - check_split(*f_idx, hinge_sgd, 0.89); - one_vs_all perceptron{f_idx, [&](class_label positive) - { - return make_unique("sgd-model-test", f_idx, positive, - class_label{"negative"}, - make_unique()); - }}; - check_split(*f_idx, perceptron, 0.85); - }); - - num_failed += testing::run_test("log-reg-cv-" + type, [&]() - { - logistic_regression logreg{"logreg-model-test", f_idx}; - check_cv(*f_idx, logreg, 0.92); - }); - - num_failed += testing::run_test("log-reg-split-" + type, [&]() - { - logistic_regression logreg{"logreg-model-test", f_idx}; - check_split(*f_idx, logreg, 0.87); - }); + nearest_centroid nc{i_idx, f_idx}; + check_split(*f_idx, nc, 0.84); + }); + + num_failed += testing::run_test( + "sgd-cv-" + type, [&]() + { + one_vs_all hinge_sgd{f_idx, [&](class_label positive) + { + return make_unique( + "sgd-model-test", f_idx, positive, + class_label{"negative"}, + make_unique()); + }}; + check_cv(*f_idx, hinge_sgd, 0.93); + one_vs_all perceptron{f_idx, [&](class_label positive) + { + return make_unique( + "sgd-model-test", f_idx, positive, + class_label{"negative"}, + make_unique()); + }}; + check_cv(*f_idx, perceptron, 0.89); + }); + + num_failed += testing::run_test( + "sgd-split-" + type, [&]() + { + one_vs_all hinge_sgd{f_idx, [&](class_label positive) + { + return make_unique( + "sgd-model-test", f_idx, positive, + class_label{"negative"}, + make_unique()); + }}; + check_split(*f_idx, hinge_sgd, 0.89); + one_vs_all perceptron{f_idx, [&](class_label positive) + { + return make_unique( + "sgd-model-test", f_idx, positive, + class_label{"negative"}, + make_unique()); + }}; + check_split(*f_idx, perceptron, 0.85); + }); + + 
num_failed += testing::run_test( + "log-reg-cv-" + type, [&]() + { + logistic_regression logreg{"logreg-model-test", f_idx}; + check_cv(*f_idx, logreg, 0.92); + }); + + num_failed += testing::run_test( + "log-reg-split-" + type, [&]() + { + logistic_regression logreg{"logreg-model-test", f_idx}; + check_split(*f_idx, logreg, 0.87); + }); num_failed += testing::run_test("winnow-cv-" + type, [&]() { - winnow win{f_idx}; - check_cv(*f_idx, win, 0.80); - }); + winnow win{f_idx}; + check_cv(*f_idx, win, 0.80); + }); num_failed += testing::run_test("winnow-split-" + type, [&]() { - winnow win{f_idx}; - // this is *really* low... is winnow broken? - check_split(*f_idx, win, 0.65); - }); - - num_failed += testing::run_test("svm-wrapper-" + type, [&]() - { - auto config = cpptoml::parse_file("test-config.toml"); - auto mod_path = config.get_as("libsvm-modules"); - if (!mod_path) - throw std::runtime_error{"no path for libsvm-modules"}; - svm_wrapper svm{f_idx, *mod_path}; - check_cv(*f_idx, svm, .80); - }); + winnow win{f_idx}; + // this is *really* low... is winnow + // broken? + check_split(*f_idx, win, 0.65); + }); + + num_failed += testing::run_test( + "svm-wrapper-" + type, [&]() + { + auto config = cpptoml::parse_file("test-config.toml"); + auto mod_path = config.get_as("libsvm-modules"); + if (!mod_path) + throw std::runtime_error{"no path for libsvm-modules"}; + svm_wrapper svm{f_idx, *mod_path}; + check_cv(*f_idx, svm, .80); + }); } system("rm -rf ceeaus-*"); From 6570af1fb93c781380fb89a25ac42e08c6bfcc4f Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Mon, 27 Apr 2015 01:09:01 -0500 Subject: [PATCH 111/481] Add save/load functionality to svm_wrapper classifier. --- include/classify/classifier/svm_wrapper.h | 12 ++++ src/classify/classifier/svm_wrapper.cpp | 10 +++ src/test/classifier_test.cpp | 75 ++++++++++++++++++----- 3 files changed, 81 insertions(+), 16 deletions(-) diff --git a/include/classify/classifier/svm_wrapper.h b/include/classify/classifier/svm_wrapper.h index 9d3f0cb62..b6b529f09 100644 --- a/include/classify/classifier/svm_wrapper.h +++ b/include/classify/classifier/svm_wrapper.h @@ -86,6 +86,18 @@ class svm_wrapper : public classifier */ void reset() override; + /** + * Loads in a model file. + * @param prefix The folder that contains the model file + */ + void load(const std::string& prefix) const; + + /** + * Saves the model to a file. + * @param prefix The folder to save the model into + */ + void save(const std::string& prefix) const; + /** * The identifier for this classifier. 
*/ diff --git a/src/classify/classifier/svm_wrapper.cpp b/src/classify/classifier/svm_wrapper.cpp index 3b9bfc19c..1c9569ec5 100644 --- a/src/classify/classifier/svm_wrapper.cpp +++ b/src/classify/classifier/svm_wrapper.cpp @@ -107,6 +107,16 @@ void svm_wrapper::reset() // nothing } +void svm_wrapper::load(const std::string& prefix) const +{ + filesystem::copy_file(prefix + "/svm-train.model", "svm-train.model"); +} + +void svm_wrapper::save(const std::string& prefix) const +{ + filesystem::copy_file("svm-train.model", prefix + "/svm-train.model"); +} + template <> std::unique_ptr make_classifier(const cpptoml::table& config, diff --git a/src/test/classifier_test.cpp b/src/test/classifier_test.cpp index 4568c02cf..a6c942a42 100644 --- a/src/test/classifier_test.cpp +++ b/src/test/classifier_test.cpp @@ -69,22 +69,6 @@ int run_tests(const std::string& type) check_split(*f_idx, nb, 0.83); }); - num_failed += testing::run_test( - "naive-bayes-save-load-" + type, [&]() - { - { - naive_bayes nb{f_idx}; - check_split(*f_idx, nb, 0.83); - filesystem::make_directory("naive-bayes-test"); - nb.save("naive-bayes-test"); - } - naive_bayes nb{f_idx}; - nb.load("naive-bayes-test"); - check_split(*f_idx, nb, 0.83, false); - }); - - system("rm -rf naive-bayes-test"); - num_failed += testing::run_test( "knn-cv-" + type, [&]() { @@ -197,6 +181,64 @@ int run_tests(const std::string& type) return num_failed; } +int run_load_save_tests() +{ + using namespace classify; + int num_failed = 0; + + // scope to ensure that the index objects are destroyed before trying + // to delete their directory; this is needed for weirdness on NFS or + // other filesystems that might lock opened files + { + auto i_idx + = index::make_index("test-config.toml"); + auto f_idx + = index::make_index( + "test-config.toml"); + + num_failed += testing::run_test( + "naive-bayes-save-load", [&]() + { + { + naive_bayes nb{f_idx}; + check_split(*f_idx, nb, 0.83); + filesystem::make_directory("naive-bayes-test"); + nb.save("naive-bayes-test"); + } + naive_bayes nb{f_idx}; + nb.load("naive-bayes-test"); + check_split(*f_idx, nb, 0.83, false); + }); + + system("rm -rf naive-bayes-test"); + + num_failed += testing::run_test( + "svm-wrapper-save-load", [&]() + { + auto config = cpptoml::parse_file("test-config.toml"); + auto mod_path = config.get_as("libsvm-modules"); + if (!mod_path) + throw std::runtime_error{"no path for libsvm-modules"}; + { + svm_wrapper svm{f_idx, *mod_path}; + check_split(*f_idx, svm, .80); + filesystem::make_directory("svm-wrapper-test"); + svm.save("svm-wrapper-test"); + } + filesystem::delete_file("svm-train.model"); + svm_wrapper svm{f_idx, *mod_path}; + svm.load("svm-wrapper-test"); + check_split(*f_idx, svm, 0.80); + }); + + system("rm -rf svm-wrapper-test"); + } + + system("rm -rf ceeaus-*"); + + return num_failed; +} + int classifier_tests() { int num_failed = 0; @@ -205,6 +247,7 @@ int classifier_tests() num_failed += run_tests("file"); create_config("line"); num_failed += run_tests("line"); + num_failed += run_load_save_tests(); return num_failed; } } From 34c26ebc76db235571050531de1aae83f987e5fc Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Mon, 27 Apr 2015 01:57:44 -0500 Subject: [PATCH 112/481] Add string reading/writing to io::packed. This simplifies reading/writing for stats::multinomial and stats::dirichlet. 
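A round-trip sketch of the new overloads (behavior as defined in the packed.h
hunk below: the string is stored as a NUL-terminated byte sequence, so both
calls report length + 1 bytes, and a value containing an embedded '\0' would
not survive the trip):

    #include "io/packed.h"
    #include <cassert>
    #include <sstream>

    int main()
    {
        std::stringstream ss;
        auto written = meta::io::packed::write(ss, std::string{"dirichlet"}); // 10 bytes
        std::string out;
        auto got = meta::io::packed::read(ss, out);                           // 10 bytes
        assert(written == got && out == "dirichlet");
        return 0;
    }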
--- include/io/packed.h | 37 ++++++++++++++++++++++++ include/stats/dirichlet.tcc | 54 ++++++----------------------------- include/stats/multinomial.tcc | 49 ++++--------------------------- 3 files changed, 52 insertions(+), 88 deletions(-) diff --git a/include/io/packed.h b/include/io/packed.h index defc84ad1..d1d66c7bb 100644 --- a/include/io/packed.h +++ b/include/io/packed.h @@ -13,6 +13,7 @@ #include #include #include +#include namespace meta { @@ -97,6 +98,26 @@ uint64_t write(OutputStream& stream, double value) return bytes; } +/** + * Writes a string in a packed representation. At the moment, the most + * efficient thing I can think to do here is just write it out as a + * standard C-string. + * + * @param stream The stream to write to + * @param value The value to write + * @return the number of bytes used to write out the value + */ +template +uint64_t write(OutputStream& stream, const std::string& value) +{ + for (const auto& c : value) + { + stream.put(c); + } + stream.put('\0'); + return value.size() + 1; +} + /** * Reads an unsigned integer from its packed representation. * @@ -159,6 +180,22 @@ uint64_t read(InputStream& stream, double& value) value = mantissa * std::pow(2.0, exponent); return bytes; } + +/** + * Reads a string from its packed representation. + * + * @param stream The stream to read from + * @param value The element to write into + * @return the number of bytes read + */ +template +uint64_t read(InputStream& stream, std::string& value) +{ + value.clear(); + for (auto c = stream.get(); c != '\0'; c = stream.get()) + value += c; + return value.size() + 1; +} } } } diff --git a/include/stats/dirichlet.tcc b/include/stats/dirichlet.tcc index b3bf28301..78c334c56 100644 --- a/include/stats/dirichlet.tcc +++ b/include/stats/dirichlet.tcc @@ -143,61 +143,26 @@ void dirichlet::swap(dirichlet& other) std::swap(alpha_sum_, other.alpha_sum_); } -namespace dirichlet_detail -{ -template -struct is_packable -{ - const static constexpr bool value - = util::is_numeric::value || std::is_floating_point::value; -}; - -template -typename std::enable_if::value>::type - write(std::ostream& out, const T& elem) -{ - io::packed::write(out, elem); -} - -inline void write(std::ostream& out, const std::string& elem) -{ - io::write_binary(out, elem); -} - -template -typename std::enable_if::value>::type - read(std::istream& in, T& elem) -{ - io::packed::read(in, elem); -} - -inline void read(std::istream& in, std::string& elem) -{ - io::read_binary(in, elem); -} -} - template void dirichlet::save(std::ostream& out) const { - using namespace dirichlet_detail; - write(out, static_cast(type_)); + io::packed::write(out, static_cast(type_)); switch (type_) { case type::SYMMETRIC: { - write(out, params_.fixed_alpha_); - write(out, - static_cast(alpha_sum_ / params_.fixed_alpha_)); + io::packed::write(out, params_.fixed_alpha_); + io::packed::write( + out, static_cast(alpha_sum_ / params_.fixed_alpha_)); break; } case type::ASYMMETRIC: { - write(out, params_.sparse_alpha_.size()); + io::packed::write(out, params_.sparse_alpha_.size()); for (const auto& alpha : params_.sparse_alpha_) { - write(out, alpha.first); - write(out, alpha.second); + io::packed::write(out, alpha.first); + io::packed::write(out, alpha.second); } break; } @@ -207,7 +172,6 @@ void dirichlet::save(std::ostream& out) const template void dirichlet::load(std::istream& in) { - using namespace dirichlet_detail; uint64_t typ; auto bytes = io::packed::read(in, typ); if (bytes == 0) @@ -234,9 +198,9 @@ void 
dirichlet::load(std::istream& in) for (uint64_t i = 0; i < size; ++i) { T event; - read(in, event); + io::packed::read(in, event); double count; - read(in, count); + io::packed::read(in, count); vec.emplace_back(std::move(event), count); } *this = dirichlet{vec.begin(), vec.end()}; diff --git a/include/stats/multinomial.tcc b/include/stats/multinomial.tcc index d61be95d7..a9b083260 100644 --- a/include/stats/multinomial.tcc +++ b/include/stats/multinomial.tcc @@ -7,7 +7,6 @@ #include #include "stats/multinomial.h" #include "util/identifiers.h" -#include "io/binary.h" #include "io/packed.h" namespace meta @@ -106,50 +105,15 @@ multinomial& multinomial::operator+=(const multinomial& rhs) return *this; } -namespace multi_detail -{ -template -struct is_packable -{ - const static constexpr bool value - = util::is_numeric::value || std::is_floating_point::value; -}; - -template -typename std::enable_if::value>::type - write(std::ostream& out, const T& elem) -{ - io::packed::write(out, elem); -} - -inline void write(std::ostream& out, const std::string& elem) -{ - io::write_binary(out, elem); -} - -template -typename std::enable_if::value>::type - read(std::istream& in, T& elem) -{ - io::packed::read(in, elem); -} - -inline void read(std::istream& in, std::string& elem) -{ - io::read_binary(in, elem); -} -} - template void multinomial::save(std::ostream& out) const { - using namespace multi_detail; - write(out, total_counts_); - write(out, counts_.size()); + io::packed::write(out, total_counts_); + io::packed::write(out, counts_.size()); for (const auto& count : counts_) { - write(out, count.first); - write(out, count.second); + io::packed::write(out, count.first); + io::packed::write(out, count.second); } prior_.save(out); } @@ -157,7 +121,6 @@ void multinomial::save(std::ostream& out) const template void multinomial::load(std::istream& in) { - using namespace multi_detail; clear(); double total_counts; auto bytes = io::packed::read(in, total_counts); @@ -171,8 +134,8 @@ void multinomial::load(std::istream& in) for (uint64_t i = 0; i < size; ++i) { T event; - read(in, event); - read(in, counts_[event]); + io::packed::read(in, event); + io::packed::read(in, counts_[event]); } prior_.load(in); } From 812148502287e4e2884daab1bb84f14afc88caf4 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 1 May 2015 02:57:09 -0500 Subject: [PATCH 113/481] Massage sources for cross-compilation with mingw. I'm able to cross-compile MeTA for Windows on my Linux box, provided I install the correct mingw-w64-xxx packages. 
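The porting pattern this patch applies throughout, distilled from the
disk_vector.h, filesystem.h, and mmap_file.cpp hunks below: keep the POSIX call
on the main path and guard a Windows fallback, with the bundled mman-win32 shim
(implemented over CreateFileMapping/MapViewOfFile) standing in for sys/mman.h.
A condensed sketch (surrounding headers elided as in the originals):

    #ifndef _WIN32
    #include <sys/mman.h> // POSIX mmap/munmap
    #else
    #include "mman.h"     // mman-win32 shim added by this patch
    #endif

    // mkdir differs too: the POSIX version takes a mode, the Windows CRT one does not
    inline bool make_directory(const std::string& dir_name)
    {
    #ifndef _WIN32
        return mkdir(dir_name.c_str(), 0755) == -1;
    #else
        return mkdir(dir_name.c_str()) == -1;
    #endif
    }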
--- CMakeLists.txt | 2 +- include/io/mman-win32/mman.h | 55 ++++++ include/util/disk_vector.h | 4 + include/util/filesystem.h | 4 + src/analyzers/filters/CMakeLists.txt | 2 +- src/classify/CMakeLists.txt | 2 +- src/index/CMakeLists.txt | 1 - src/index/ranker/CMakeLists.txt | 1 + src/index/tools/CMakeLists.txt | 6 +- src/io/CMakeLists.txt | 32 ++-- src/io/mman-win32/CMakeLists.txt | 4 + src/io/mman-win32/mman.c | 180 ++++++++++++++++++ src/io/mmap_file.cpp | 5 + src/lm/CMakeLists.txt | 2 +- src/parser/CMakeLists.txt | 4 +- .../analyzers/featurizers/CMakeLists.txt | 3 +- src/parser/{trees => }/evalb.cpp | 0 src/parser/tools/CMakeLists.txt | 2 +- src/parser/trees/CMakeLists.txt | 4 +- src/parser/trees/visitors/CMakeLists.txt | 18 +- src/sequence/CMakeLists.txt | 5 +- src/sequence/io/CMakeLists.txt | 2 +- src/test/CMakeLists.txt | 2 +- 23 files changed, 299 insertions(+), 41 deletions(-) create mode 100644 include/io/mman-win32/mman.h create mode 100644 src/io/mman-win32/CMakeLists.txt create mode 100644 src/io/mman-win32/mman.c rename src/parser/{trees => }/evalb.cpp (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index bdc5a05a0..7cc864453 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ if(ZLIB_FOUND) include_directories(SYSTEM ${ZLIB_INCLUDE_DIRS}) endif() -if(UNIX) +if(UNIX OR MINGW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pedantic") # if we don't already set the standard for the compiler, detect the diff --git a/include/io/mman-win32/mman.h b/include/io/mman-win32/mman.h new file mode 100644 index 000000000..56c30ed2c --- /dev/null +++ b/include/io/mman-win32/mman.h @@ -0,0 +1,55 @@ +/* + * sys/mman.h + * mman-win32 + */ + +#ifndef _SYS_MMAN_H_ +#define _SYS_MMAN_H_ + +#ifndef _WIN32_WINNT // Allow use of features specific to Windows XP or later. +#define _WIN32_WINNT 0x0501 // Change this to the appropriate value to target other versions of Windows. +#endif + +/* All the headers include this file. */ +#ifndef _MSC_VER +#include <_mingw.h> +#endif + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define PROT_NONE 0 +#define PROT_READ 1 +#define PROT_WRITE 2 +#define PROT_EXEC 4 + +#define MAP_FILE 0 +#define MAP_SHARED 1 +#define MAP_PRIVATE 2 +#define MAP_TYPE 0xf +#define MAP_FIXED 0x10 +#define MAP_ANONYMOUS 0x20 +#define MAP_ANON MAP_ANONYMOUS + +#define MAP_FAILED ((void *)-1) + +/* Flags for msync. 
*/ +#define MS_ASYNC 1 +#define MS_SYNC 2 +#define MS_INVALIDATE 4 + +void* mmap(void *addr, size_t len, int prot, int flags, int fildes, off_t off); +int munmap(void *addr, size_t len); +int mprotect(void *addr, size_t len, int prot); +int msync(void *addr, size_t len, int flags); +int mlock(const void *addr, size_t len); +int munlock(const void *addr, size_t len); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MMAN_H_ */ diff --git a/include/util/disk_vector.h b/include/util/disk_vector.h index 3687abc51..890173f5e 100644 --- a/include/util/disk_vector.h +++ b/include/util/disk_vector.h @@ -14,7 +14,11 @@ #include #include #include +#ifndef _WIN32 #include +#else +#include "mman.h" +#endif #include #include "meta.h" diff --git a/include/util/filesystem.h b/include/util/filesystem.h index f99129c80..c6df09c88 100644 --- a/include/util/filesystem.h +++ b/include/util/filesystem.h @@ -51,7 +51,11 @@ inline void rename_file(const std::string& old_name, */ inline bool make_directory(const std::string& dir_name) { +#ifndef _WIN32 return mkdir(dir_name.c_str(), 0755) == -1; +#else + return mkdir(dir_name.c_str()) == -1; +#endif } /** diff --git a/src/analyzers/filters/CMakeLists.txt b/src/analyzers/filters/CMakeLists.txt index ebb4f2943..6ace17403 100644 --- a/src/analyzers/filters/CMakeLists.txt +++ b/src/analyzers/filters/CMakeLists.txt @@ -11,4 +11,4 @@ add_library(meta-filters alpha_filter.cpp porter2_stemmer.cpp ptb_normalizer.cpp sentence_boundary.cpp) -target_link_libraries(meta-filters meta-utf porter2-stemmer) +target_link_libraries(meta-filters meta-utf porter2-stemmer meta-tokenizers) diff --git a/src/classify/CMakeLists.txt b/src/classify/CMakeLists.txt index 0954a9240..7fdd28c18 100644 --- a/src/classify/CMakeLists.txt +++ b/src/classify/CMakeLists.txt @@ -34,5 +34,5 @@ add_library(meta-classify binary_classifier_factory.cpp classifier/winnow.cpp classifier_factory.cpp confusion_matrix.cpp) -target_link_libraries(meta-classify meta-index meta-loss) +target_link_libraries(meta-classify meta-ranker meta-loss) add_dependencies(meta-classify liblinear libsvm) diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt index 657a4405a..0045a5d88 100644 --- a/src/index/CMakeLists.txt +++ b/src/index/CMakeLists.txt @@ -15,5 +15,4 @@ add_library(meta-index disk_index.cpp vocabulary_map_writer.cpp) target_link_libraries(meta-index meta-analyzers meta-eval - meta-ranker ${CMAKE_THREAD_LIBS_INIT}) diff --git a/src/index/ranker/CMakeLists.txt b/src/index/ranker/CMakeLists.txt index ba1de219a..e00a24a22 100644 --- a/src/index/ranker/CMakeLists.txt +++ b/src/index/ranker/CMakeLists.txt @@ -8,3 +8,4 @@ add_library(meta-ranker absolute_discount.cpp pivoted_length.cpp ranker.cpp ranker_factory.cpp) +target_link_libraries(meta-ranker meta-index) diff --git a/src/index/tools/CMakeLists.txt b/src/index/tools/CMakeLists.txt index ebbdda31b..7d325ca98 100644 --- a/src/index/tools/CMakeLists.txt +++ b/src/index/tools/CMakeLists.txt @@ -1,10 +1,10 @@ add_executable(query-runner query-runner.cpp) -target_link_libraries(query-runner meta-index +target_link_libraries(query-runner meta-ranker meta-sequence-analyzers meta-parser-analyzers) add_executable(search search.cpp) -target_link_libraries(search meta-index +target_link_libraries(search meta-ranker meta-sequence-analyzers meta-parser-analyzers) @@ -14,7 +14,7 @@ target_link_libraries(index meta-index meta-parser-analyzers) add_executable(interactive-search interactive-search.cpp) -target_link_libraries(interactive-search meta-index 
+target_link_libraries(interactive-search meta-ranker meta-sequence-analyzers meta-parser-analyzers) diff --git a/src/io/CMakeLists.txt b/src/io/CMakeLists.txt index 45d09c1fd..72d742804 100644 --- a/src/io/CMakeLists.txt +++ b/src/io/CMakeLists.txt @@ -2,19 +2,23 @@ project(meta-io) add_subdirectory(tools) +set(IO_SOURCES compressed_file_reader.cpp + compressed_file_writer.cpp + libsvm_parser.cpp + mmap_file.cpp + parser.cpp) + +set(IO_DEPS meta-util) + if (ZLIB_FOUND) - add_library(meta-io compressed_file_reader.cpp - compressed_file_writer.cpp - gzstream.cpp - libsvm_parser.cpp - mmap_file.cpp - parser.cpp) - target_link_libraries(meta-io meta-util ${ZLIB_LIBRARIES}) -else() - add_library(meta-io compressed_file_reader.cpp - compressed_file_writer.cpp - libsvm_parser.cpp - mmap_file.cpp - parser.cpp) - target_link_libraries(meta-io meta-util) + list(APPEND IO_SOURCES gzstream.cpp) + list(APPEND IO_DEPS ${ZLIB_LIBRARIES}) endif() + +if (WIN32) + add_subdirectory(mman-win32) + list(APPEND IO_DEPS mman-win32) +endif() + +add_library(meta-io ${IO_SOURCES}) +target_link_libraries(meta-io ${IO_DEPS}) diff --git a/src/io/mman-win32/CMakeLists.txt b/src/io/mman-win32/CMakeLists.txt new file mode 100644 index 000000000..fd1ac5a65 --- /dev/null +++ b/src/io/mman-win32/CMakeLists.txt @@ -0,0 +1,4 @@ +include_directories(${PROJECT_SOURCE_DIR}/../../include/io/mman-win32) +add_library(mman-win32 mman.c) +target_include_directories(mman-win32 PUBLIC + ${PROJECT_SOURCE_DIR}/../../include/io/mman-win32) diff --git a/src/io/mman-win32/mman.c b/src/io/mman-win32/mman.c new file mode 100644 index 000000000..5b0bc7899 --- /dev/null +++ b/src/io/mman-win32/mman.c @@ -0,0 +1,180 @@ + +#include +#include +#include + +#include "mman.h" + +#ifndef FILE_MAP_EXECUTE +#define FILE_MAP_EXECUTE 0x0020 +#endif /* FILE_MAP_EXECUTE */ + +static int __map_mman_error(const DWORD err, const int deferr) +{ + if (err == 0) + return 0; + //TODO: implement + return err; +} + +static DWORD __map_mmap_prot_page(const int prot) +{ + DWORD protect = 0; + + if (prot == PROT_NONE) + return protect; + + if ((prot & PROT_EXEC) != 0) + { + protect = ((prot & PROT_WRITE) != 0) ? + PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ; + } + else + { + protect = ((prot & PROT_WRITE) != 0) ? + PAGE_READWRITE : PAGE_READONLY; + } + + return protect; +} + +static DWORD __map_mmap_prot_file(const int prot) +{ + DWORD desiredAccess = 0; + + if (prot == PROT_NONE) + return desiredAccess; + + if ((prot & PROT_READ) != 0) + desiredAccess |= FILE_MAP_READ; + if ((prot & PROT_WRITE) != 0) + desiredAccess |= FILE_MAP_WRITE; + if ((prot & PROT_EXEC) != 0) + desiredAccess |= FILE_MAP_EXECUTE; + + return desiredAccess; +} + +void* mmap(void *addr, size_t len, int prot, int flags, int fildes, off_t off) +{ + HANDLE fm, h; + + void * map = MAP_FAILED; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4293) +#endif + + const DWORD dwFileOffsetLow = (sizeof(off_t) <= sizeof(DWORD)) ? + (DWORD)off : (DWORD)(off & 0xFFFFFFFFL); + const DWORD dwFileOffsetHigh = (sizeof(off_t) <= sizeof(DWORD)) ? + (DWORD)0 : (DWORD)((off >> 32) & 0xFFFFFFFFL); + const DWORD protect = __map_mmap_prot_page(prot); + const DWORD desiredAccess = __map_mmap_prot_file(prot); + + const off_t maxSize = off + (off_t)len; + + const DWORD dwMaxSizeLow = (sizeof(off_t) <= sizeof(DWORD)) ? + (DWORD)maxSize : (DWORD)(maxSize & 0xFFFFFFFFL); + const DWORD dwMaxSizeHigh = (sizeof(off_t) <= sizeof(DWORD)) ? 
+ (DWORD)0 : (DWORD)((maxSize >> 32) & 0xFFFFFFFFL); + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + errno = 0; + + if (len == 0 + /* Unsupported flag combinations */ + || (flags & MAP_FIXED) != 0 + /* Usupported protection combinations */ + || prot == PROT_EXEC) + { + errno = EINVAL; + return MAP_FAILED; + } + + h = ((flags & MAP_ANONYMOUS) == 0) ? + (HANDLE)_get_osfhandle(fildes) : INVALID_HANDLE_VALUE; + + if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE) + { + errno = EBADF; + return MAP_FAILED; + } + + fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL); + + if (fm == NULL) + { + errno = __map_mman_error(GetLastError(), EPERM); + return MAP_FAILED; + } + + map = MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len); + + CloseHandle(fm); + + if (map == NULL) + { + errno = __map_mman_error(GetLastError(), EPERM); + return MAP_FAILED; + } + + return map; +} + +int munmap(void *addr, size_t len) +{ + if (UnmapViewOfFile(addr)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +int mprotect(void *addr, size_t len, int prot) +{ + DWORD newProtect = __map_mmap_prot_page(prot); + DWORD oldProtect = 0; + + if (VirtualProtect(addr, len, newProtect, &oldProtect)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +int msync(void *addr, size_t len, int flags) +{ + if (FlushViewOfFile(addr, len)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +int mlock(const void *addr, size_t len) +{ + if (VirtualLock((LPVOID)addr, len)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +int munlock(const void *addr, size_t len) +{ + if (VirtualUnlock((LPVOID)addr, len)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} diff --git a/src/io/mmap_file.cpp b/src/io/mmap_file.cpp index 992214d35..6b37b0ee3 100644 --- a/src/io/mmap_file.cpp +++ b/src/io/mmap_file.cpp @@ -3,7 +3,12 @@ * @author Sean Massung */ +#ifndef _WIN32 #include +#else +#include "mman.h" +#endif + #include #include #include diff --git a/src/lm/CMakeLists.txt b/src/lm/CMakeLists.txt index c512aac26..b0c063a9d 100644 --- a/src/lm/CMakeLists.txt +++ b/src/lm/CMakeLists.txt @@ -3,4 +3,4 @@ project(meta-language-model) add_subdirectory(tools) add_library(meta-language-model language_model.cpp) -target_link_libraries(meta-language-model meta-corpus) +target_link_libraries(meta-language-model meta-corpus meta-analyzers) diff --git a/src/parser/CMakeLists.txt b/src/parser/CMakeLists.txt index 0642952ef..e561140cc 100644 --- a/src/parser/CMakeLists.txt +++ b/src/parser/CMakeLists.txt @@ -10,7 +10,9 @@ add_library(meta-parser sr_parser.cpp state_analyzer.cpp training_data.cpp transition.cpp - transition_map.cpp) + transition_map.cpp + evalb.cpp + $) target_link_libraries(meta-parser meta-parser-trees meta-parser-io meta-io diff --git a/src/parser/analyzers/featurizers/CMakeLists.txt b/src/parser/analyzers/featurizers/CMakeLists.txt index bdf934e2a..0318d6e54 100644 --- a/src/parser/analyzers/featurizers/CMakeLists.txt +++ b/src/parser/analyzers/featurizers/CMakeLists.txt @@ -6,4 +6,5 @@ add_library(meta-parser-featurizers branch_featurizer.cpp subtree_featurizer.cpp tag_featurizer.cpp) -target_link_libraries(meta-parser-featurizers meta-parser-trees) +target_link_libraries(meta-parser-featurizers meta-parser-trees + meta-corpus) diff --git a/src/parser/trees/evalb.cpp b/src/parser/evalb.cpp similarity index 100% rename from 
src/parser/trees/evalb.cpp
rename to src/parser/evalb.cpp
diff --git a/src/parser/tools/CMakeLists.txt b/src/parser/tools/CMakeLists.txt
index d59cdbccf..322c434cf 100644
--- a/src/parser/tools/CMakeLists.txt
+++ b/src/parser/tools/CMakeLists.txt
@@ -1,5 +1,5 @@
 add_executable(read-trees read_trees.cpp)
-target_link_libraries(read-trees meta-parser-io)
+target_link_libraries(read-trees meta-parser)
 
 add_executable(parser-train parser_train.cpp)
 target_link_libraries(parser-train meta-parser meta-util)
diff --git a/src/parser/trees/CMakeLists.txt b/src/parser/trees/CMakeLists.txt
index 3400a57f2..7aedea67c 100644
--- a/src/parser/trees/CMakeLists.txt
+++ b/src/parser/trees/CMakeLists.txt
@@ -2,9 +2,7 @@ project(meta-parser-trees)
 
 add_subdirectory(visitors)
 
-add_library(meta-parser-trees evalb.cpp
-                              leaf_node.cpp
+add_library(meta-parser-trees leaf_node.cpp
                               node.cpp
                               internal_node.cpp
                               parse_tree.cpp)
-target_link_libraries(meta-parser-trees meta-tree-visitors)
diff --git a/src/parser/trees/visitors/CMakeLists.txt b/src/parser/trees/visitors/CMakeLists.txt
index b13f4551c..adffffeb9 100644
--- a/src/parser/trees/visitors/CMakeLists.txt
+++ b/src/parser/trees/visitors/CMakeLists.txt
@@ -1,11 +1,11 @@
 project(meta-tree-visitors)
 
-add_library(meta-tree-visitors annotation_remover.cpp
-                               binarizer.cpp
-                               debinarizer.cpp
-                               empty_remover.cpp
-                               head_finder.cpp
-                               leaf_node_finder.cpp
-                               sequence_extractor.cpp
-                               transition_finder.cpp
-                               unary_chain_remover.cpp)
+add_library(meta-tree-visitors OBJECT annotation_remover.cpp
+                                      binarizer.cpp
+                                      debinarizer.cpp
+                                      empty_remover.cpp
+                                      head_finder.cpp
+                                      leaf_node_finder.cpp
+                                      sequence_extractor.cpp
+                                      transition_finder.cpp
+                                      unary_chain_remover.cpp)
diff --git a/src/sequence/CMakeLists.txt b/src/sequence/CMakeLists.txt
index fbc44d4e6..4a60b15f3 100644
--- a/src/sequence/CMakeLists.txt
+++ b/src/sequence/CMakeLists.txt
@@ -8,8 +8,9 @@ add_subdirectory(tools)
 add_library(meta-sequence observation.cpp
                           sequence.cpp
                           sequence_analyzer.cpp
-                          trellis.cpp)
-target_link_libraries(meta-sequence meta-sequence-io meta-io meta-utf)
+                          trellis.cpp
+                          $<TARGET_OBJECTS:meta-sequence-io>)
+target_link_libraries(meta-sequence meta-io meta-utf)
 
 add_library(meta-greedy-tagger perceptron.cpp)
 target_link_libraries(meta-greedy-tagger meta-sequence meta-io)
diff --git a/src/sequence/io/CMakeLists.txt b/src/sequence/io/CMakeLists.txt
index 009e510db..21d5beb08 100644
--- a/src/sequence/io/CMakeLists.txt
+++ b/src/sequence/io/CMakeLists.txt
@@ -1,3 +1,3 @@
 project(meta-sequence-io)
 
-add_library(meta-sequence-io ptb_parser.cpp)
+add_library(meta-sequence-io OBJECT ptb_parser.cpp)
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index 9cf1b4b05..026d20c0f 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -15,7 +15,7 @@ add_library(meta-testing analyzer_test.cpp
                          graph_test.cpp
                          vocabulary_map_test.cpp
                          parser_test.cpp)
-target_link_libraries(meta-testing meta-index meta-classify meta-parser-io)
+target_link_libraries(meta-testing meta-index meta-classify meta-parser)
 
 set(UNIT_TEST_EXE unit-test)
 include(unit_tests.cmake)

From 363c084bce713bc53f6fc54b1532d7e8b67d1a33 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sat, 2 May 2015 00:58:36 -0500
Subject: [PATCH 114/481] Finish porting for MSYS2/MinGW64.

The unit tests now pass under this environment!
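One fix in this patch is worth spelling out. io/packed.h previously scaled the
mantissa with "1ul << digits", and on LLP64 targets such as MinGW64 an
unsigned long is only 32 bits wide, so shifting it by the 53 mantissa digits
of a double is undefined behavior; LP64 Linux/OS X masked the bug. The
following is a minimal standalone sketch of the pitfall and the fix
(illustrative only, not part of the patch itself):

    #include <cstdint>
    #include <iostream>
    #include <limits>

    int main()
    {
        // 53 for IEEE-754 double: 52 stored mantissa bits plus the implicit bit
        auto digits = std::numeric_limits<double>::digits;

        // unsigned long bad = 1ul << digits; // UB where unsigned long is 32-bit

        // forcing a 64-bit left operand makes the shift well-defined everywhere
        auto scale = uint64_t{1} << digits;
        std::cout << scale << "\n"; // 9007199254740992 == 2^53
    }

The other recurring fix below, opening streams with std::ios::binary and
fopen with "wb", keeps Windows CRLF translation from corrupting binary data.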
--- include/io/packed.h | 2 +- include/util/filesystem.h | 8 ++++---- src/classify/CMakeLists.txt | 4 ++-- src/classify/classifier/svm_wrapper.cpp | 25 +++++++++++++++++++++++++ src/io/compressed_file_writer.cpp | 2 +- 5 files changed, 33 insertions(+), 8 deletions(-) diff --git a/include/io/packed.h b/include/io/packed.h index d1d66c7bb..a2089d476 100644 --- a/include/io/packed.h +++ b/include/io/packed.h @@ -82,7 +82,7 @@ uint64_t write(OutputStream& stream, double value) int exp; auto digits = std::numeric_limits::digits; auto mantissa - = static_cast(std::frexp(value, &exp) * (1ul << digits)); + = static_cast(std::frexp(value, &exp) * (uint64_t{1} << digits)); int64_t exponent = exp - digits; // see dlib link above; tries to shrink mantissa for more efficient diff --git a/include/util/filesystem.h b/include/util/filesystem.h index c6df09c88..11a12f000 100644 --- a/include/util/filesystem.h +++ b/include/util/filesystem.h @@ -111,8 +111,8 @@ inline bool copy_file(const std::string& source, const std::string& dest) if (size > max_size) { printing::progress prog{"Copying file ", size}; - std::ifstream source_file{source}; - std::ofstream dest_file{dest}; + std::ifstream source_file{source, std::ios::binary}; + std::ofstream dest_file{dest, std::ios::binary}; uint64_t buf_size = 1024UL * 1024UL * 32UL; // 32 MB buffer uint64_t total_processed = 0; std::vector buffer(buf_size); @@ -121,8 +121,8 @@ inline bool copy_file(const std::string& source, const std::string& dest) source_file.read(buffer.data(), buf_size); auto processed = source_file.gcount(); total_processed += processed; - dest_file.write(buffer.data(), total_processed); - prog(processed); + dest_file.write(buffer.data(), processed); + prog(total_processed); } prog.end(); } diff --git a/src/classify/CMakeLists.txt b/src/classify/CMakeLists.txt index 7fdd28c18..da6669c6b 100644 --- a/src/classify/CMakeLists.txt +++ b/src/classify/CMakeLists.txt @@ -7,7 +7,7 @@ ExternalProject_Add(liblinear SOURCE_DIR ${meta_SOURCE_DIR}/../deps/libsvm-modules/liblinear BUILD_IN_SOURCE 1 CONFIGURE_COMMAND "" - BUILD_COMMAND make + BUILD_COMMAND CXX=${CMAKE_CXX_COMPILER} CC=${CMAKE_C_COMPILER} make LOG_BUILD 0 INSTALL_COMMAND "") @@ -15,7 +15,7 @@ ExternalProject_Add(libsvm SOURCE_DIR ${meta_SOURCE_DIR}/../deps/libsvm-modules/libsvm BUILD_IN_SOURCE 1 CONFIGURE_COMMAND "" - BUILD_COMMAND make + BUILD_COMMAND CXX=${CMAKE_CXX_COMPILER} CC=${CMAKE_C_COMPILER} make LOG_BUILD 0 INSTALL_COMMAND "") diff --git a/src/classify/classifier/svm_wrapper.cpp b/src/classify/classifier/svm_wrapper.cpp index 1c9569ec5..8ed7bd8b6 100644 --- a/src/classify/classifier/svm_wrapper.cpp +++ b/src/classify/classifier/svm_wrapper.cpp @@ -41,9 +41,20 @@ class_label svm_wrapper::classify(doc_id d_id) out.close(); // run liblinear/libsvm +#ifndef _WIN32 std::string command = svm_path_ + executable_ + "predict svm-input svm-train.model svm-predicted"; command += " > /dev/null 2>&1"; +#else + // first set of quotes is around the exe name to make things work without + // having to use forward slashes in the path name. 
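+    // (for illustration: with an unquoted, hypothetical svm_path_ such as
+    // C:\Program Files\libsvm\, the command would also split at the space,
+    // so the exe name is kept inside its own pair of quotes)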
+ // + // second set of quotes is around the entire command, since Windows does + // strange things in making the command to actually be sent to CMD.exe + auto command = "\"\"" + svm_path_ + executable_ + + "predict.exe\" svm-input svm-train.model svm-predicted"; + command += " > NUL 2>&1\""; +#endif system(command.c_str()); // extract answer @@ -65,9 +76,16 @@ confusion_matrix svm_wrapper::test(const std::vector& docs) out.close(); // run liblinear/libsvm +#ifndef _WIN32 std::string command = svm_path_ + executable_ + "predict svm-input svm-train.model svm-predicted"; command += " > /dev/null 2>&1"; +#else + // see comment in classify() + auto command = "\"\"" + svm_path_ + executable_ + + "predict.exe\" svm-input svm-train.model svm-predicted"; + command += " > NUL 2>&1\""; +#endif system(command.c_str()); // extract answer @@ -96,9 +114,16 @@ void svm_wrapper::train(const std::vector& docs) out << idx_->liblinear_data(d_id) << "\n"; out.close(); +#ifndef _WIN32 std::string command = svm_path_ + executable_ + "train " + options_.at(kernel_) + " svm-train"; command += " > /dev/null 2>&1"; +#else + // see comment in classify() + auto command = "\"\"" + svm_path_ + executable_ + "train.exe\" " + + options_.at(kernel_) + " svm-train"; + command += " > NUL 2>&1\""; +#endif system(command.c_str()); } diff --git a/src/io/compressed_file_writer.cpp b/src/io/compressed_file_writer.cpp index 7b1e8ff05..9e204ca2c 100644 --- a/src/io/compressed_file_writer.cpp +++ b/src/io/compressed_file_writer.cpp @@ -15,7 +15,7 @@ namespace io compressed_file_writer::compressed_file_writer( const std::string& filename, std::function mapping) - : outfile_{fopen(filename.c_str(), "w")}, + : outfile_{fopen(filename.c_str(), "wb")}, char_cursor_{0}, bit_cursor_{0}, buffer_size_{1024 * 1024 * 64}, // 64 MB From 0809d3c0daf0d958f99fceebb9aca9035a83e7fd Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Wed, 6 May 2015 12:32:29 -0500 Subject: [PATCH 115/481] remove current LM implementation --- include/lm/language_model.h | 38 +------ src/lm/language_model.cpp | 218 +----------------------------------- 2 files changed, 3 insertions(+), 253 deletions(-) diff --git a/include/lm/language_model.h b/include/lm/language_model.h index 1a305dd36..83f14c1a4 100644 --- a/include/lm/language_model.h +++ b/include/lm/language_model.h @@ -13,7 +13,6 @@ #include #include #include -#include #include "cpptoml.h" #include "lm/sentence.h" @@ -30,13 +29,6 @@ class language_model */ language_model(const cpptoml::table& config); - /** - * Creates an N-gram language model based on the corpus specified in the - * config file. - * @param n The value of n, which overrides any setting in the config file - */ - language_model(const cpptoml::table& config, size_t n); - /** * Randomly generates one token sequence based on and symbols. * @return a random sequence of tokens based on this language model @@ -81,35 +73,7 @@ class language_model size_t k) const; private: - /** - * Builds the probabilities associated with this language model. 
- * @param config The config file that specifies the location of the - * corpus - */ - void learn_model(const cpptoml::table& config); - - /** - * @param config - */ - void select_method(const cpptoml::table& config); - - /** - * @param prefix Path to where the counts files are stored - */ - void read_precomputed(const std::string& prefix); - - /// The language_model used to interpolate with this one for smoothing - std::shared_ptr interp_; // shared to allow copying - - /// Contains the N-gram distribution probabilities (N-1 words -> (w, prob)) - std::unordered_map> - dist_; - - /// The value of N in this n-gram - size_t N_; - - /// The interpolation coefficient for smoothing LM probabilities - constexpr static double lambda_ = 0.7; + uint64_t N_; }; class language_model_exception : public std::runtime_error diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index c9e757f69..b54a53a21 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -10,10 +10,6 @@ #include #include #include -#include "analyzers/analyzer.h" -#include "analyzers/tokenizers/icu_tokenizer.h" -#include "analyzers/filters/all.h" -#include "corpus/corpus.h" #include "util/shim.h" #include "lm/language_model.h" @@ -25,157 +21,11 @@ namespace lm language_model::language_model(const cpptoml::table& config) { auto table = config.get_table("language-model"); - auto nval = table->get_as("n-value"); - if (!nval) - throw language_model_exception{ - "no n-value specified in language-model table"}; - - N_ = *nval; - - if (N_ > 1) - interp_ = std::make_shared(config, N_ - 1); - - select_method(config); -} - -void language_model::select_method(const cpptoml::table& config) -{ - auto table = config.get_table("language-model"); - auto format = table->get_as("format"); - if (!format) - throw language_model_exception{ - "no format specified in language-model table"}; - - if (*format == "precomputed") - { - auto prefix = table->get_as("prefix"); - if (!prefix) - throw language_model_exception{ - "no prefix specified for precomputed language model"}; - read_precomputed(*prefix); - } - else if (*format == "learn") - learn_model(config); - else - throw language_model_exception{ - "language-model format could not be determined"}; -} - -language_model::language_model(const cpptoml::table& config, size_t n) - : N_{n} -{ - if (N_ > 1) - interp_ = std::make_shared(config, N_ - 1); - - select_method(config); -} - -void language_model::learn_model(const cpptoml::table& config) -{ - std::cout << "Learning " << N_ << "-gram language model" << std::endl; - auto corpus = corpus::corpus::load(config); - - using namespace analyzers; - std::unique_ptr stream; - stream = make_unique(); - stream = make_unique(std::move(stream)); - stream = make_unique(std::move(stream)); - stream = make_unique(std::move(stream)); - - while (corpus->has_next()) - { - auto doc = corpus->next(); - stream->set_content(doc.content()); - - // get ngram stream started - sentence ngram; - for (size_t i = 1; i < N_; ++i) - ngram.push_back(""); - - // count each ngram occurrence - while (*stream) - { - auto token = stream->next(); - if (N_ > 1) - { - ++dist_[ngram.to_string()][token]; - ngram.pop_front(); - ngram.push_back(token); - } - else - ++dist_[""][token]; // unigram has no previous tokens - } - } - - // turn counts into probabilities - for (auto& map : dist_) - { - double sum = 0.0; - for (auto& end : map.second) - sum += end.second; - for (auto& end : map.second) - end.second /= sum; - } -} - -void language_model::read_precomputed(const 
std::string& prefix) -{ - std::cout << "Reading " << N_ << "-gram language model" << std::endl; - std::ifstream in{prefix + std::to_string(N_) + "-grams.txt"}; - std::string line; - uint64_t count; - while (in) - { - std::getline(in, line); - std::istringstream iss{line}; - iss >> count; - sentence ngram; - std::string token; - for (size_t i = 0; i < N_ - 1; ++i) - { - iss >> token; - ngram.push_back(token); - } - - // if there is one remaining token to read - if (iss) - { - iss >> token; - dist_[ngram.to_string()][token] = count; - } - else // if unigram - { - dist_[""][ngram.to_string()] = count; - } - } - - // turn counts into probabilities - for (auto& map : dist_) - { - double sum = 0.0; - for (auto& end : map.second) - sum += end.second; - for (auto& end : map.second) - end.second /= sum; - } } std::string language_model::next_token(const sentence& tokens, double random) const { - auto it = dist_.find(tokens.to_string()); - if (it == dist_.end()) - throw language_model_exception{"couldn't find previous n - 1 tokens: " - + tokens.to_string()}; - - double cur = 0.0; - for (auto& end : it->second) - { - cur += end.second; - if (cur > random) - return end.first; - } - throw language_model_exception{"could not generate next token: " + tokens.to_string()}; } @@ -183,80 +33,16 @@ std::string language_model::next_token(const sentence& tokens, std::vector> language_model::top_k(const sentence& prev, size_t k) const { - if (prev.size() != N_ - 1) - throw language_model_exception{"prev should contain n - 1 tokens"}; - - auto it = dist_.find(prev.to_string()); - if (it == dist_.end()) - throw language_model_exception{"no transitions found"}; - - using pair_t = std::pair; - std::vector probs{it->second.begin(), it->second.end()}; - - auto comp = [&](const pair_t& a, const pair_t& b) - { - return a.second > b.second; - }; - if (k >= probs.size()) - { - std::sort(probs.begin(), probs.end(), comp); - return probs; - } - - std::nth_element(probs.begin(), probs.begin() + k, probs.end(), comp); - std::vector sorted{probs.begin(), probs.begin() + k}; - std::sort(sorted.begin(), sorted.end(), comp); - - return sorted; } std::string language_model::generate(unsigned int seed) const { - std::default_random_engine gen(seed); - std::uniform_real_distribution rdist(0.0, 1.0); - - // start generating at the beginning of a sequence - sentence ngram; - for (size_t n = 1; n < N_; ++n) - ngram.push_back(""); - - // keep generating until we see - std::string output; - std::string next = next_token(ngram, rdist(gen)); - while (next != "") - { - if (ngram.front() != "") - output += " " + ngram.front(); - ngram.pop_front(); - ngram.push_back(next); - next = next_token(ngram, rdist(gen)); - } - - output += " " + ngram.to_string(); - return output; + return ""; } double language_model::prob(sentence tokens) const { - if (tokens.size() != N_) - throw language_model_exception{"prob() needs one N-gram"}; - - sentence interp_tokens{tokens}; - interp_tokens.pop_front(); // look at prev N - 1 - auto interp_prob = interp_ ? 
interp_->prob(interp_tokens) : 1.0; - - auto last = tokens.back(); - tokens.pop_back(); - - auto endings = dist_.find(tokens.to_string()); - if (endings == dist_.end()) - return (1.0 - lambda_) * interp_prob; - - auto prob = endings->second.find(last); - if (prob == endings->second.end()) - return (1.0 - lambda_) * interp_prob; - - return lambda_ * prob->second + (1.0 - lambda_) * interp_prob; + return 0.0; } double language_model::perplexity(const sentence& tokens) const From dad6f23ab3cda1d82c9de47872e3992d43cc1683 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Thu, 7 May 2015 19:31:41 -0500 Subject: [PATCH 116/481] draft .arpa reader for language model --- include/lm/language_model.h | 45 +++++++++++--- include/lm/sentence.h | 3 +- src/lm/diff.cpp | 2 +- src/lm/language_model.cpp | 109 +++++++++++++++++++++++++++----- src/lm/sentence.cpp | 121 +++++++++++++++++++++++++----------- src/lm/tools/lm-test.cpp | 18 +++++- 6 files changed, 236 insertions(+), 62 deletions(-) diff --git a/include/lm/language_model.h b/include/lm/language_model.h index 83f14c1a4..ca96ff0e2 100644 --- a/include/lm/language_model.h +++ b/include/lm/language_model.h @@ -12,6 +12,7 @@ #include #include +#include #include #include "cpptoml.h" #include "lm/sentence.h" @@ -48,32 +49,60 @@ class language_model * model: \f$ \sqrt[n]{\prod_{i=1}^n\frac{1}{p(w_i|w_{i-n}\cdots w_{i-1})}} * \f$ */ - double perplexity(const sentence& tokens) const; + float perplexity(const sentence& tokens) const; /** * @param sentence A sequence of tokens * @return the perplexity of this token sequence given the current language * model normalized by the length of the sequence */ - double perplexity_per_word(const sentence& tokens) const; + float perplexity_per_word(const sentence& tokens) const; /** - * @param tokens A sequence of n tokens - * @return the probability of seeing the nth token based on the previous n - * - 1 tokens + * @param tokens A sequence of n tokens (one sentence) + * @return the log probability of the likelihood of this sentence */ - double prob(sentence tokens) const; + float log_prob(sentence tokens) const; /** * @param prev Seen tokens to base the next token off of * @param k Number of results to return * @return a sorted vector of likely next tokens */ - std::vector> top_k(const sentence& prev, + std::vector> top_k(const sentence& prev, size_t k) const; private: - uint64_t N_; + /** + * Reads precomputed LM data into this object. + * @param arpa_file The path to the ARPA-formatted file + */ + void read_arpa_format(const std::string& arpa_file); + + /** + * @param tokens + * @return the log probability of one ngram + */ + float prob_calc(sentence tokens) const; + + uint64_t N_; /// The "n" value for this n-gram language model + + /** + * Simple struct to keep track of probabilities and backoff values. + */ + struct lm_node + { + lm_node(): + prob{0.0f}, backoff{0.0f} {} + + lm_node(float p, float b): + prob{p}, backoff{b} {} + + float prob; + float backoff; + }; + + std::unordered_map lm_; }; class language_model_exception : public std::runtime_error diff --git a/include/lm/sentence.h b/include/lm/sentence.h index 35a9d4c22..c32ea4849 100644 --- a/include/lm/sentence.h +++ b/include/lm/sentence.h @@ -31,8 +31,9 @@ class sentence * Creates a sentence based on a text string, parsed with the default filter * chain. 
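+     * (when tokenize is false, the text is assumed to be pre-tokenized
+     * and is simply split on whitespace)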
     * @param text
+     * @param tokenize Whether or not to tokenize the input sentence
      */
-    sentence(const std::string& text);
+    sentence(const std::string& text, bool tokenize = true);
 
     /**
      * @return a string representation of this sentence
diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp
index 838f9d4a5..45c248727 100644
--- a/src/lm/diff.cpp
+++ b/src/lm/diff.cpp
@@ -95,7 +95,7 @@ void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth)
     for (uint64_t i = n_val_ - 1; i < sent.size(); ++i)
     {
         auto ngram = sent(i - (n_val_ - 1), i + 1);
-        auto prob = lm_.prob(ngram);
+        auto prob = lm_.log_prob(ngram);
         if (prob < min_prob)
         {
             min_prob = prob;
diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp
index b54a53a21..bf8c592c2 100644
--- a/src/lm/language_model.cpp
+++ b/src/lm/language_model.cpp
@@ -21,6 +21,44 @@ namespace lm
 language_model::language_model(const cpptoml::table& config)
 {
     auto table = config.get_table("language-model");
+    auto arpa_file = table->get_as<std::string>("arpa-file");
+    read_arpa_format(*arpa_file);
+}
+
+void language_model::read_arpa_format(const std::string& arpa_file)
+{
+    std::ifstream infile{arpa_file};
+    std::string buffer;
+
+    // get to beginning of unigram data
+    while (std::getline(infile, buffer))
+    {
+        if (buffer.find("\\1-grams:") == 0)
+            break;
+    }
+
+    N_ = 0;
+
+    while (std::getline(infile, buffer))
+    {
+        if (buffer.empty())
+            continue;
+
+        if (buffer[0] == '\\')
+        {
+            ++N_;
+            continue;
+        }
+
+        auto first_tab = buffer.find_first_of('\t');
+        float prob = std::stof(buffer.substr(0, first_tab));
+        auto second_tab = buffer.find_first_of('\t', first_tab + 1);
+        auto ngram = buffer.substr(first_tab + 1, second_tab - first_tab - 1);
+        float backoff = 0.0;
+        if (second_tab != std::string::npos)
+            backoff = std::stof(buffer.substr(second_tab + 1));
+        lm_[ngram] = {prob, backoff};
+    }
 }
 
 std::string language_model::next_token(const sentence& tokens,
                                        double random) const
 {
     throw language_model_exception{"could not generate next token: "
                                    + tokens.to_string()};
 }
 
-std::vector<std::pair<std::string, double>>
+std::vector<std::pair<std::string, float>>
     language_model::top_k(const sentence& prev, size_t k) const
 {
 }
 
 std::string language_model::generate(unsigned int seed) const
 {
     return "";
 }
 
-double language_model::prob(sentence tokens) const
+float language_model::prob_calc(sentence tokens) const
 {
-    return 0.0;
+    if (tokens.size() == 1)
+    {
+        auto it = lm_.find(tokens[0]);
+        if (it != lm_.end())
+            return it->second.prob;
+        return lm_.at("<unk>").prob;
+    }
+    else
+    {
+        auto it = lm_.find(tokens.to_string());
+        if (it != lm_.end())
+            return it->second.prob;
+
+        auto hist = tokens(0, tokens.size() - 1);
+        tokens.pop_front();
+        if (tokens.size() == 1)
+        {
+            hist = hist(0, 1);
+            auto it = lm_.find(hist[0]);
+            if (it == lm_.end())
+                hist.substitute(0, "<unk>");
+        }
+
+        it = lm_.find(hist.to_string());
+        if (it != lm_.end())
+            return it->second.backoff + prob_calc(tokens);
+        return prob_calc(tokens);
+    }
 }
 
-double language_model::perplexity(const sentence& tokens) const
+float language_model::log_prob(sentence tokens) const
 {
+    tokens.push_front("<s>");
+    tokens.push_back("</s>");
+    float prob = 0.0f;
+
+    // tokens < N
     sentence ngram;
-    for (size_t i = 1; i < N_; ++i)
-        ngram.push_back("<s>");
+    for (uint64_t i = 0; i < N_ - 1; ++i)
+    {
+        ngram.push_back(tokens[i]);
+        prob += prob_calc(ngram);
+    }
 
-    double perp = 0.0;
-    for (auto& token : tokens)
+    // tokens >= N
+    for (uint64_t i = N_ - 1; i < tokens.size(); ++i)
     {
-        ngram.push_back(token);
-        perp += std::log(1.0 / prob(ngram));
+        ngram.push_back(tokens[i]);
+        prob += prob_calc(ngram);
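+        // slide the n-gram window: drop the oldest token so the history
+        // stays at N_ - 1 tokens for the next iteration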
         ngram.pop_front();
     }
 
-    return perp / N_;
+    return prob;
+}
+
+float language_model::perplexity(const sentence& tokens) const
+{
+    if (tokens.size() == 0)
+        throw language_model_exception{"perplexity() called on empty sentence"};
+    return std::pow(
+        10.0, -(log_prob(tokens) / (tokens.size() + 2))); // +2 for <s> and </s>
 }
 
-double language_model::perplexity_per_word(const sentence& tokens) const
+float language_model::perplexity_per_word(const sentence& tokens) const
 {
     if (tokens.size() == 0)
         throw language_model_exception{
-            "perplexity_per_word called on empty sentence"};
-    return perplexity(tokens) / tokens.size();
+            "perplexity_per_word() called on empty sentence"};
+    return perplexity(tokens) / (tokens.size() + 2); // +2 for <s> and </s>
 }
 }
 }
diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp
index 887928352..d76d1c540 100644
--- a/src/lm/sentence.cpp
+++ b/src/lm/sentence.cpp
@@ -4,38 +4,50 @@
 */
 
 #include 
-
+#include 
 #include 
 #include "lm/sentence.h"
 #include "analyzers/analyzer.h"
 #include "analyzers/tokenizers/icu_tokenizer.h"
+#include "analyzers/tokenizers/whitespace_tokenizer.h"
 #include "analyzers/filters/all.h"
 
 namespace meta
 {
 namespace lm
 {
-sentence::sentence(const std::string& text)
+sentence::sentence(const std::string& text, bool tokenize /* = true */)
 {
-    using namespace analyzers;
-    std::unique_ptr stream;
-    stream = make_unique();
-    stream = make_unique(std::move(stream));
-    stream = make_unique(std::move(stream));
-    stream = make_unique(std::move(stream));
-    stream->set_content(text);
-    while (*stream)
-        tokens_.push_back(stream->next());
-
-    if (tokens_.empty())
-        throw sentence_exception{"empty token stream"};
-
-    // remove sentence markers
-    tokens_.pop_front();
-    tokens_.pop_back();
-
-    if (tokens_.empty())
-        throw sentence_exception{"empty token stream"};
+    if (tokenize)
+    {
+        using namespace analyzers;
+        std::unique_ptr stream;
+        stream = make_unique();
+        stream = make_unique();
+        stream = make_unique(std::move(stream));
+        stream = make_unique(std::move(stream));
+        stream = make_unique(std::move(stream));
+        stream->set_content(text);
+        while (*stream)
+            tokens_.push_back(stream->next());
+
+        if (tokens_.empty())
+            throw sentence_exception{"empty token stream"};
+
+        // remove sentence markers
+        tokens_.pop_front();
+        tokens_.pop_back();
+
+        if (tokens_.empty())
+            throw sentence_exception{"empty token stream"};
+    }
+    else
+    {
+        std::istringstream iss{text};
+        std::copy(std::istream_iterator<std::string>(iss),
+                  std::istream_iterator<std::string>(),
+                  std::back_inserter(tokens_));
+    }
 }
 
 std::string sentence::to_string() const
@@ -70,7 +82,7 @@ sentence sentence::operator()(size_type from, size_type to) const
 void sentence::substitute(size_type idx, const std::string& token,
                           double weight /* = 0.0 */)
{ tokens_.insert(tokens_.begin() + idx, token); - //ops_.push_back("insert(" + std::to_string(idx) + ", " + token + ")"); + // ops_.push_back("insert(" + std::to_string(idx) + ", " + token + ")"); ops_.push_back("insert(" + token + ")"); weights_.push_back(weight); } @@ -102,33 +115,69 @@ double sentence::average_weight() const return sum / weights_.size(); } -std::vector sentence::weights() const { return weights_; } +std::vector sentence::weights() const +{ + return weights_; +} -const std::vector& sentence::operations() const { return ops_; } +const std::vector& sentence::operations() const +{ + return ops_; +} -std::string sentence::front() const { return tokens_.front(); } +std::string sentence::front() const +{ + return tokens_.front(); +} -std::string sentence::back() const { return tokens_.back(); } +std::string sentence::back() const +{ + return tokens_.back(); +} void sentence::push_front(const std::string& token) { tokens_.push_front(token); } -void sentence::pop_front() { tokens_.pop_front(); } +void sentence::pop_front() +{ + tokens_.pop_front(); +} -void sentence::push_back(const std::string& token) { tokens_.push_back(token); } +void sentence::push_back(const std::string& token) +{ + tokens_.push_back(token); +} -void sentence::pop_back() { tokens_.pop_back(); } +void sentence::pop_back() +{ + tokens_.pop_back(); +} -sentence::iterator sentence::begin() { return tokens_.begin(); } +sentence::iterator sentence::begin() +{ + return tokens_.begin(); +} -sentence::iterator sentence::end() { return tokens_.end(); } +sentence::iterator sentence::end() +{ + return tokens_.end(); +} -sentence::const_iterator sentence::begin() const { return tokens_.cbegin(); } +sentence::const_iterator sentence::begin() const +{ + return tokens_.cbegin(); +} -sentence::const_iterator sentence::end() const { return tokens_.cend(); } +sentence::const_iterator sentence::end() const +{ + return tokens_.cend(); +} -sentence::size_type sentence::size() const { return tokens_.size(); } +sentence::size_type sentence::size() const +{ + return tokens_.size(); +} } } diff --git a/src/lm/tools/lm-test.cpp b/src/lm/tools/lm-test.cpp index 0f7513f0b..4c07df0a5 100644 --- a/src/lm/tools/lm-test.cpp +++ b/src/lm/tools/lm-test.cpp @@ -8,6 +8,7 @@ #include "meta.h" #include "lm/diff.h" #include "lm/sentence.h" +#include "lm/language_model.h" #include "logging/logger.h" #include "util/progress.h" #include "util/filesystem.h" @@ -16,6 +17,20 @@ using namespace meta; int main(int argc, char* argv[]) { + logging::set_cerr_logging(); + lm::language_model model{cpptoml::parse_file(argv[1])}; + lm::sentence s1{"I disagree with this statement for several reasons .", + false}; + std::cout << s1.to_string() << ": " << model.log_prob(s1) << std::endl; + lm::sentence s2{"I disagree with this octopus for several reasons .", + false}; + std::cout << s2.to_string() << ": " << model.log_prob(s2) << std::endl; + lm::sentence s3{"Hello world !", false}; + std::cout << s3.to_string() << ": " << model.log_prob(s3) << std::endl; + lm::sentence s4{"xyz xyz xyz", false}; + std::cout << s4.to_string() << ": " << model.log_prob(s4) << std::endl; + + /* if (argc != 3) { std::cerr << "Usage: " << argv[0] << " config.toml sentences.txt" @@ -23,8 +38,6 @@ int main(int argc, char* argv[]) return 1; } - logging::set_cerr_logging(); - lm::diff correcter{cpptoml::parse_file(argv[1])}; std::ifstream in{argv[2]}; auto num_sentences = filesystem::num_lines(argv[2]); @@ -65,4 +78,5 @@ int main(int argc, char* argv[]) prog.end(); std::cout << "Percent 
no-ops: " << do_nothing / done << std::endl; + */ } From 192c7afd257e0b6b88cea2ada76cfb31fdad59e4 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Thu, 7 May 2015 19:59:31 -0500 Subject: [PATCH 117/481] language model unit tests --- data/english-sentences.arpa | 33246 +++++++++++++++++++++++++++++ include/test/lm_test.h | 31 + src/lm/tools/lm-test.cpp | 17 +- src/test/CMakeLists.txt | 1 + src/test/inverted_index_test.cpp | 3 +- src/test/lm_test.cpp | 41 + src/test/tools/unit-test.cpp | 4 + src/test/unit_tests.cmake | 4 + 8 files changed, 33331 insertions(+), 16 deletions(-) create mode 100644 data/english-sentences.arpa create mode 100644 include/test/lm_test.h create mode 100644 src/test/lm_test.cpp diff --git a/data/english-sentences.arpa b/data/english-sentences.arpa new file mode 100644 index 000000000..c4af4d106 --- /dev/null +++ b/data/english-sentences.arpa @@ -0,0 +1,33246 @@ +\data\ +ngram 1=2768 +ngram 2=12182 +ngram 3=18284 + +\1-grams: +-4.1183567 0 +0 -0.63579637 +-3.4271748 0 +-2.4996314 I -0.2709696 +-3.8506508 disagree -0.14086568 +-2.174265 with -0.2945207 +-2.511044 this -0.16834387 +-3.5348082 statement -0.14589529 +-1.9226898 for -0.3749427 +-3.153469 several -0.18439111 +-2.8938322 reasons -0.3760948 +-1.426225 . -2.5361726 +-3.9746375 While -0.098042734 +-2.219103 it -0.3107041 +-2.269109 can -0.31470096 +-2.5472004 be -0.18867591 +-3.9746375 argued -0.098042734 +-1.9633206 that -0.36922604 +-2.8413033 having -0.38809296 +-1.8339452 a -0.33833933 +-2.4177468 part-time -0.55441916 +-2.645509 job -0.26853126 +-1.9732625 is -0.3179428 +-2.8938322 valuable -0.14450517 +-3.5348082 preparation -0.47831634 +-3.340989 full-time -0.098042734 +-1.7727894 the -0.2817252 +-2.7136247 student -0.2349929 +-2.3569138 will -0.2810348 +-3.678229 acquire -0.098042734 +-3.1054616 upon -0.2741496 +-3.340989 graduation -0.2548424 +-1.4526992 , -0.46682543 +-2.866774 often -0.12548718 +-2.4085171 not -0.22340627 +-3.5348082 case -0.13342881 +-3.8506508 In -0.10741108 +-2.7136247 many -0.17417175 +-3.2691073 cases -0.17496733 +-2.817244 any -0.16434178 +-2.9536004 employment -0.23850599 +-3.678229 obtained -0.17195599 +-2.4177468 by -0.24772288 +-1.7878366 in -0.41475922 +-2.8938322 field -0.51152325 +-1.6439362 and -0.21729526 +-1.6765372 of -0.36378208 +-3.4271748 nature -0.14086568 +-3.340989 completely -0.098042734 +-3.678229 unrelated -0.17195599 +-1.6063511 to -0.40661365 +-2.0663714 their -0.2614526 +-3.678229 chosen -0.1281937 +-3.2074509 course -0.15613267 +-2.645509 study -0.22778022 +-3.9746375 For -0.098042734 +-3.8506508 example -0.13342881 +-2.7727888 studying -0.26001975 +-3.5348082 engineering -0.098042734 +-2.6955683 has -0.27701992 +-2.817244 very -0.16373841 +-3.2074509 little -0.098042734 +-2.511044 experience -0.28569293 +-3.340989 gain -0.16288683 +-2.3035645 from -0.26378733 +-2.427177 working -0.31814906 +-2.179623 as -0.33811408 +-3.9746375 waiter -0.098042734 +-3.2691073 restaurant -0.14784537 +-3.9746375 Moreover -0.098042734 +-2.817244 there -0.33824086 +-3.8506508 ample -0.098042734 +-2.9536004 reason -0.23492767 +-2.1637433 students -0.45364022 +-3.678229 engage -0.35337758 +-3.9746375 Such -0.098042734 +-3.8506508 mean -0.098042734 +-3.678229 considerable -0.098042734 +-3.340989 amount -0.6544076 +-3.340989 additional -0.20720947 +-3.340989 stress -0.13342881 +-2.174265 on -0.36171323 +-3.8506508 top -0.21423665 +-3.9746375 created -0.098042734 +-2.9536004 studies -0.27414817 +-3.9746375 The -0.098042734 +-2.6150792 also -0.1678165 +-3.9746375 increased 
-0.098042734 +-3.0229268 less -0.12430835 +-2.3569138 time -0.35682118 +-3.1054616 available -0.2964432 +-3.9746375 accomplish -0.098042734 +-3.9746375 Additionally -0.098042734 +-2.6300275 jobs -0.24673419 +-3.5348082 leave -0.098042734 +-2.559954 one -0.2572041 +-3.4271748 tired -0.14086568 +-3.8506508 thus -0.098042734 +-3.5348082 unable -0.47831634 +-3.153469 focus -0.19507152 +-3.8506508 effectively -0.098042734 +-2.678233 ? -1.008282 +-3.8506508 resulting -0.098042734 +-3.8506508 lower -0.098042734 +-3.5348082 quality -0.1522653 +-2.7324646 school -0.27006212 +-2.2560573 work -0.34280708 +-3.0622365 being -0.12611718 +-3.9746375 produced -0.098042734 +-3.9746375 Therefore -0.098042734 +-2.7136247 if -0.33696187 +-3.0622365 related -0.57519233 +-3.2074509 f -0.098042734 +-3.8506508 found -0.098042734 +-2.678233 -LRB- -0.12223055 +-2.6150792 such -0.35941598 +-2.5472004 an -0.14576791 +-3.9746375 internship -0.098042734 +-2.645509 -RRB- -0.27277613 +-2.9868817 then -0.11401739 +-2.5866425 may -0.2907103 +-3.9746375 worth -0.098042734 +-3.4271748 pursuing -0.098042734 +-2.9536004 but -0.17043503 +-3.8506508 otherwise -0.098042734 +-2.4885108 should -0.35904452 +-2.794448 only -0.15665703 +-3.9746375 sought -0.098042734 +-3.153469 necessary -0.19507152 +-2.922689 financial -0.13703011 +-3.9746375 i.e. -0.098042734 +-3.2074509 pay -0.22503476 +-3.153469 tuition -0.18922512 +-3.340989 fees -0.22739327 +-3.9746375 Yes -0.098042734 +-3.340989 agree -0.33653778 +-2.269109 college -0.44256377 +-2.4085171 have -0.30732065 +-3.9746375 Time -0.098042734 +-2.559954 money -0.36377907 +-2.2825649 they -0.39242604 +-3.340989 say -0.20720947 +-3.9746375 And -0.098042764 +-2.7727888 you -0.17147988 +-3.1054616 extra -0.271326 +-3.2691073 making -0.18983142 +-2.6150792 some -0.23380876 +-2.661563 so -0.20789708 +-3.153469 bad -0.14589529 +-3.5348082 following -0.12131027 +-3.0229268 : -0.11285661 +-3.5348082 - -0.098042734 +-3.678229 To -0.115375474 +-3.4271748 begin -0.20720947 +-3.2691073 today -0.12430835 +-3.153469 's -0.116145 +-3.8506508 market -0.098042734 +-3.1054616 become -0.12322667 +-3.5348082 highly -0.098042734 +-3.8506508 competitive -0.098042734 +-2.817244 most -0.1802425 +-3.340989 degree -0.12131027 +-3.2691073 does -0.1281937 +-3.8506508 guarantee -0.098042734 +-2.2017417 or -0.1551769 +-3.8506508 pays -0.098042734 +-2.8413033 well -0.22909641 +-3.9746375 Most -0.098042734 +-3.9746375 advertisements -0.098042734 +-3.8506508 appear -0.098042734 +-3.9746375 newspapers -0.098042734 +-3.9746375 on-line -0.098042734 +-3.9746375 portals -0.098042734 +-3.8506508 preferred -0.098042734 +-3.8506508 candidates -0.098042734 +-2.8413033 even -0.14589529 +-3.9746375 entry -0.098042734 +-3.340989 level -0.20720947 +-3.340989 positions -0.1281937 +-2.2625341 are -0.2012413 +-3.8506508 As -0.14086568 +-3.5348082 ways -0.098042734 +-3.0622365 career -0.22583756 +-3.9746375 oriented -0.098042734 +-3.5348082 internships -0.098042734 +-3.4271748 practical -0.098042734 +-3.2074509 experiences -0.17723958 +-3.9746375 Another -0.098042734 +-3.9746375 option -0.098042734 +-3.9746375 Part-time -0.098042734 +-3.2074509 offer -0.12131027 +-3.2074509 benefits -0.12430835 +-3.9746375 Primarily -0.098042734 +-3.2691073 benefit -0.14589529 +-3.9746375 Apart -0.098042734 +-2.559954 part -0.73008823 +-3.9746375 lays -0.098042734 +-3.8506508 foundation -0.17195599 +-3.8506508 history -0.098042734 +-3.0622365 future -0.20006908 +-3.5348082 carefully -0.098042734 +-3.4271748 consider -0.1281937 +-3.9746375 select 
-0.098042734 +-3.8506508 order -0.21423665 +-3.5348082 skill -0.1522653 +-3.8506508 By -0.12430835 +-3.9746375 selecting -0.098042734 +-3.9746375 gaps -0.098042734 +-3.0622365 academic -0.13560295 +-3.8506508 qualification -0.098042734 +-3.9746375 minimized -0.098042734 +-2.8938322 ; -0.16036421 +-3.2074509 makes -0.26010817 +-3.8506508 candidate -0.098042734 +-2.4670892 more -0.15903787 +-3.9746375 attractive -0.098042734 +-3.340989 potential -0.098042734 +-3.1054616 employers -0.118926615 +-3.2691073 helps -0.1522653 +-2.9536004 better -0.116986044 +-3.2691073 understanding -0.30636662 +-3.4271748 whether -0.13342881 +-3.9746375 corporate -0.098042734 +-3.5348082 organization -0.098042734 +-3.8506508 hotel -0.098042734 +-3.678229 industry -0.098042734 +-3.9746375 evolve -0.098042734 +-2.5348108 skills -0.3170925 +-2.678233 important -0.3392031 +-3.678229 employer -0.098042734 +-3.9746375 These -0.098042734 +-3.678229 include -0.098042734 +-3.8506508 leadership -0.21423665 +-3.8506508 commitment -0.21423665 +-3.8506508 team -0.098042734 +-3.9746375 spirit -0.098042734 +-3.8506508 interpersonal -0.098042734 +-3.5348082 management -0.18983142 +-3.1054616 taking -0.24834287 +-3.9746375 criticism -0.098042734 +-3.9746375 positively -0.098042734 +-3.9746375 Besides -0.098042734 +-3.2691073 knowledge -0.24782339 +-3.678229 gained -0.098042734 +-2.866774 through -0.1740617 +-3.340989 educational -0.098042734 +-2.7136247 who -0.23004654 +-3.340989 productive -0.098042734 +-3.4271748 developed -0.098042734 +-3.8506508 overall -0.098042734 +-3.678229 personality -0.17195599 +-3.153469 able -0.8304989 +-3.9746375 assimilate -0.098042734 +-2.866774 themselves -0.16573042 +-2.7136247 into -0.24834287 +-3.0622365 environment -0.098042734 +-3.340989 learned -0.12430835 +-3.340989 position -0.17723958 +-3.9746375 expand -0.098042734 +-3.5348082 increase -0.1522653 +-3.678229 abilities -0.098042734 +-2.3035645 at -0.35974315 +-3.8506508 same -0.098042734 +-3.9746375 benefiting -0.098042734 +-3.2074509 financially -0.14589529 +-3.9746375 There -0.098042734 +-3.2691073 two -0.33214822 +-2.9868817 first -0.1447969 +-3.340989 issue -0.18983142 +-3.8506508 Many -0.115375474 +-3.9746375 complain -0.098042734 +-3.4271748 don -0.45026425 +-3.153469 ft -0.12271854 +-2.866774 enough -0.19867742 +-2.9868817 spend -0.2253515 +-3.2691073 food -0.14920501 +-3.4271748 times -0.14086568 +-3.9746375 Having -0.098042734 +-3.2074509 gives -0.20895563 +-2.866774 need -0.30885217 +-3.9746375 emergency -0.098042734 +-3.9746375 Saving -0.098042734 +-3.678229 earned -0.14086568 +-2.8938322 good -0.11466875 +-3.153469 idea -0.34725833 +-3.9746375 alternative -0.098042734 +-3.9746375 asking -0.098042734 +-3.2074509 others -0.17195599 +-3.0229268 like -0.13847902 +-2.8938322 parents -0.19664675 +-3.1054616 friends -0.28161559 +-3.9746375 However -0.098042734 +-2.922689 doing -0.17161582 +-3.678229 again -0.098042734 +-3.4271748 put -0.098042734 +-3.9746375 strains -0.098042734 +-3.4271748 relationships -0.1605951 +-2.522765 people -0.20933177 +-3.8506508 borrowed -0.21423665 +-3.9746375 They -0.098042734 +-3.4271748 eventually -0.098042734 +-3.340989 see -0.098042764 +-3.9746375 evampire -0.098042734 +-3.9746375 f. 
-0.098042734 +-3.8506508 This -0.10566091 +-3.340989 especially -0.13342881 +-3.340989 true -0.098042734 +-3.5348082 College -0.14589529 +-3.678229 show -0.098042734 +-3.678229 fre -0.098042734 +-3.8506508 putting -0.098042734 +-3.5348082 effort -0.098042734 +-3.8506508 keeping -0.098042734 +-3.8506508 stable -0.098042734 +-3.9746375 People -0.098042734 +-3.8506508 willing -0.21423665 +-2.9868817 help -0.23529968 +-2.511044 when -0.33111084 +-3.8506508 trying -0.21423665 +-2.7324646 do -0.258052 +-3.2691073 something -0.2100067 +-3.0229268 your -0.14409229 +-3.4271748 earn -0.14589529 +-2.9868817 other -0.12005409 +-2.8413033 while -0.3406869 +-3.9746375 Of -0.098042734 +-3.5348082 major -0.098042734 +-3.9746375 advantage -0.098042734 +-3.8506508 teaching -0.098042734 +-3.0229268 take -0.14324087 +-3.9746375 tutoring -0.098042734 +-2.9868817 high -0.25890478 +-3.4271748 chance -0.20720947 +-3.153469 use -0.14086568 +-3.9746375 theories -0.098042734 +-3.9746375 practices -0.098042734 +-2.922689 classes -0.19090188 +-3.678229 fll -0.098042734 +-3.2074509 know -0.13679342 +-2.661563 what -0.21927345 +-3.8506508 works -0.098042734 +-3.9746375 doesn -0.098042734 +-3.678229 ft. -0.35337758 +-2.7136247 could -0.19622515 +-3.8506508 expanding -0.21423665 +-3.9746375 horizons -0.098042734 +-3.8506508 increasing -0.098042734 +-3.678229 opportunities -0.098042734 +-2.7727888 after -0.42475128 +-3.9746375 conclusion -0.098042734 +-3.5348082 encourage -0.098042734 +-2.4996314 all -0.22485438 +-2.7521589 because -0.31599462 +-3.2074509 opportunity -0.37108302 +-2.511044 them -0.2832598 +-3.9746375 present -0.098042734 +-3.9746375 situations -0.098042734 +-3.9746375 futures -0.098042734 +-3.1054616 Japanese -0.22962537 +-3.678229 distractions -0.17195599 +-3.2074509 real -0.13060203 +-3.4271748 therefore -0.098042734 +-3.0622365 provide -0.15665703 +-3.153469 useful -0.14589529 +-2.9868817 society -0.23463899 +-3.8506508 specialized -0.098042734 +-3.9746375 Workers -0.098042734 +-3.678229 expected -0.35337758 +-3.9746375 function -0.098042734 +-3.9746375 broad -0.098042734 +-3.8506508 range -0.21423665 +-3.9746375 contexts -0.098042734 +-3.9746375 Evaluations -0.098042734 +-2.8938322 must -0.2078073 +-3.9746375 familiar -0.098042734 +-3.340989 done -0.12430835 +-3.678229 area -0.1522653 +-3.9746375 specialty -0.098042734 +-3.9746375 If -0.098042734 +-3.4271748 n't -0.098042734 +-3.4271748 whatever -0.14086568 +-3.340989 might -0.13342881 +-3.8506508 produce -0.098042734 +-3.678229 fit -0.1522653 +-3.9746375 Neither -0.098042734 +-3.8506508 nor -0.098042734 +-3.9746375 Learning -0.098042734 +-3.4271748 itself -0.098042734 +-3.9746375 Because -0.098042734 +-3.9746375 external -0.098042734 +-3.8506508 subject -0.098042734 +-3.8506508 interfere -0.21423665 +-3.5348082 grades -0.14086568 +-3.9746375 specialization -0.098042734 +-3.9746375 It -0.098042734 +-3.340989 usually -0.098042734 +-3.678229 menial -0.098042734 +-3.9746375 specialist -0.098042734 +-3.9746375 useless -0.098042734 +-2.8413033 get -0.23702982 +-3.8506508 fields -0.098042734 +-3.9746375 Internships -0.098042734 +-3.678229 relevant -0.098042734 +-3.5348082 another -0.098042734 +-3.8506508 source -0.21423665 +-3.8506508 temptation -0.21423665 +-3.4271748 colleges -0.098042734 +-3.340989 club -0.13342881 +-2.9536004 activities -0.16798685 +-3.9746375 socialize -0.098042734 +-3.8506508 sufficient -0.098042734 +-3.2074509 teach -0.098042734 +-2.8938322 how -0.32529625 +-3.9746375 behave -0.098042734 +-3.2691073 away -0.5172111 +-3.2691073 give 
-0.16397229 +-3.9746375 unstructured -0.098042734 +-2.7324646 social -0.17316 +-3.9746375 unpredictable -0.098042734 +-3.9746375 results -0.098042734 +-3.9746375 unwilling -0.098042734 +-3.5348082 fellow -0.274173 +-3.8506508 teachers -0.098042734 +-3.9746375 unworthy -0.098042734 +-3.9746375 finely -0.098042734 +-3.9746375 honed -0.098042734 +-3.2691073 workers -0.098042764 +-3.678229 demands -0.098042734 +-3.8506508 strongly -0.098042734 +-3.2691073 believe -0.24114136 +-2.794448 fs -0.12210803 +-3.678229 main -0.14589529 +-3.9746375 lot -0.098042734 +-2.559954 life -0.31104797 +-3.2691073 entering -0.17723958 +-3.9746375 workforce -0.098042734 +-3.678229 prior -0.17195599 +-3.9746375 recently-graduated -0.098042734 +-3.9746375 woefully -0.098042734 +-3.9746375 unprepared -0.098042734 +-3.8506508 realities -0.098042734 +-3.153469 responsibilities -0.14784537 +-3.8506508 particularly -0.098042734 +-2.8938322 university -0.21144658 +-3.340989 used -0.20720947 +-3.678229 staying -0.098042734 +-2.7521589 up -0.19388978 +-3.340989 late -0.098042734 +-3.8506508 night -0.21423665 +-3.678229 sleeping -0.098042734 +-3.9746375 neglecting -0.098042734 +-3.8506508 commitments -0.098042734 +-3.0229268 no -0.12131027 +-3.4271748 immediate -0.098042734 +-3.9746375 repercussions -0.098042734 +-3.5348082 result -0.14086568 +-3.2074509 difficult -0.3447334 +-3.153469 make -0.13503471 +-3.5348082 lifestyle -0.098042734 +-3.9746375 adjustment -0.098042734 +-3.9746375 subsequent -0.098042734 +-3.8506508 A -0.11285661 +-3.9746375 undertaken -0.098042734 +-3.2691073 still -0.098042734 +-3.4271748 quite -0.098042764 +-3.678229 helpful -0.17195599 +-3.678229 regard -0.1522653 +-3.9746375 Maintaining -0.098042734 +-3.9746375 exercise -0.098042734 +-3.153469 personal -0.098042734 +-3.8506508 recent -0.21423665 +-3.340989 graduates -0.1281937 +-3.8506508 similarly -0.098042734 +-3.9746375 difficulty -0.098042734 +-3.5348082 managing -0.23276964 +-3.2074509 finances -0.14589529 +-3.8506508 responsibly -0.098042734 +-3.9746375 western -0.098042734 +-3.678229 countries -0.17195599 +-3.8506508 unfortunate -0.098042734 +-3.8506508 reality -0.098042734 +-3.9746375 combines -0.098042734 +-3.8506508 levels -0.098042734 +-3.4271748 debt -0.14086568 +-3.4271748 credit -0.45026425 +-3.8506508 cards -0.13342881 +-3.340989 loans -0.13342881 +-3.8506508 etc. 
-0.17195599 +-3.9746375 grave -0.098042734 +-3.9746375 peril -0.098042734 +-3.8506508 providing -0.098042734 +-3.5348082 cash -0.098042734 +-3.9746375 flow -0.098042734 +-3.8506508 least -0.098042734 +-3.8506508 small -0.098042734 +-2.9868817 income -0.1281937 +-2.6300275 which -0.26262486 +-3.9746375 counter -0.098042734 +-3.9746375 debts -0.098042734 +-3.8506508 largely -0.098042734 +-3.9746375 Firstly -0.098042734 +-3.5348082 been -0.098042734 +-3.8506508 myself -0.098042734 +-3.8506508 am -0.098042764 +-3.678229 aware -0.098042734 +-3.678229 pressures -0.098042734 +-3.9746375 generated -0.098042734 +-2.9536004 living -0.38954097 +-3.5348082 expenses -0.098042734 +-3.9746375 Often -0.098042734 +-2.6150792 these -0.16411746 +-3.9746375 met -0.098042734 +-3.9746375 combination -0.098042734 +-3.9746375 grants -0.098042734 +-3.0622365 family -0.15366085 +-3.8506508 savings -0.098042734 +-3.9746375 restrict -0.098042734 +-2.8938322 during -0.29814366 +-3.8506508 crucial -0.098042734 +-3.5348082 period -0.14086568 +-3.340989 adult -0.098042734 +-3.4271748 development -0.29227865 +-3.2691073 earning -0.1522653 +-3.9746375 constructive -0.098042734 +-2.9868817 way -0.23310995 +-3.9746375 easing -0.098042734 +-3.678229 allowing -0.098042734 +-3.4271748 lead -0.37145987 +-3.5348082 independent -0.1605951 +-3.8506508 Secondly -0.21423665 +-3.9746375 regards -0.098042734 +-3.9746375 prospects -0.098042734 +-3.8506508 rarely -0.098042734 +-3.340989 factor -0.098042734 +-3.8506508 majority -0.21423665 +-3.4271748 graduate -0.12430835 +-3.153469 professional -0.12131027 +-3.9746375 transferable -0.098042734 +-3.5348082 ability -0.14086568 +-3.9746375 communicate -0.098042734 +-3.0622365 customers -0.17496733 +-3.2691073 responsible -0.17723958 +-3.678229 almost -0.098042734 +-3.9746375 achievement -0.098042734 +-3.9746375 Not -0.098042734 +-3.9746375 demonstrating -0.098042734 +-3.9746375 attributes -0.098042734 +-3.678229 aspects -0.35337758 +-3.4271748 enjoy -0.13342881 +-3.9746375 expect -0.098042734 +-3.2691073 once -0.2741563 +-3.9746375 Thirdly -0.098042734 +-3.9746375 wider -0.098042734 +-3.9746375 scale -0.098042734 +-3.8506508 necessity -0.098042734 +-3.9746375 economical -0.098042734 +-3.4271748 sense -0.3119509 +-3.8506508 businesses -0.098042734 +-3.8506508 require -0.098042734 +-3.678229 force -0.098042734 +-3.8506508 flexible -0.098042734 +-3.9746375 fill -0.098042734 +-2.645509 would -0.25441036 +-3.678229 member -0.35337758 +-3.9746375 staff -0.098042734 +-3.9746375 contrast -0.098042734 +-3.5348082 arguments -0.098042734 +-3.340989 however -0.24782339 +-3.9746375 cope -0.098042734 +-3.8506508 depends -0.098042734 +-3.5348082 situation -0.21423665 +-3.9746375 determined -0.098042734 +-3.9746375 condition -0.098042734 +-3.2074509 year -0.17723958 +-3.8506508 type -0.21423665 +-3.9746375 builds -0.098042734 +-3.9746375 encourages -0.098042734 +-3.8506508 stake -0.098042734 +-2.7521589 education -0.32130542 +-3.8506508 prepares -0.21423665 +-3.0229268 world -0.18407424 +-3.9746375 encounter -0.098042734 +-3.9746375 countless -0.098042734 +-3.9746375 acquired -0.098042734 +-3.9746375 readily -0.098042734 +-3.9746375 apparent -0.098042734 +-3.5348082 balancing -0.098042734 +-3.9746375 Skills -0.098042734 +-3.9746375 prioritization -0.098042734 +-3.9746375 multitasking -0.098042734 +-3.5348082 finding -0.23276964 +-3.2074509 success -0.14086568 +-2.9536004 going -0.5208247 +-3.340989 provides -0.13342881 +-3.8506508 perfect -0.098042734 +-3.5348082 training -0.1522653 +-3.8506508 ground 
-0.098042734
+-3.678229 improve -0.098042734
+-3.9746375 Too -0.098042734
+-3.5348082 burden -0.1522653
+-3.8506508 supported -0.098042734
+-3.4271748 entirely -0.098042734
+-3.9746375 consequence -0.098042734
+-3.8506508 concept -0.21423665
+-3.678229 actual -0.1522653
[... remaining \1-grams entries elided: each added line has the ARPA form "<log10 probability> <token> <log10 backoff weight>" ...]
+-3.9746375 clearly -0.098042734
+-3.9746375 harmful -0.098042734
+-3.9746375 secondhand -0.098042734
+
+\2-grams:
+-0.0012649981 . </s> 0
+-0.05949721 ? </s> 0
+-2.1446047 -RRB- </s> 0
+-0.12845917 ! </s> 0
+-1.7663178 individual </s> 0
+-1.0685297 <s> I -0.41439435
+-1.6155277 reasons I -0.11268411
+-2.3118994 that I -0.13618872
+-2.1031294 job I -0.04895735
+-1.3998553 , I -0.22112566
+-2.0044537 and I -0.048957378
+-1.6042035 as I -0.048957378
+-1.3682404 reason I -0.048957378
+-2.673871 students I -0.048957378
+-0.9847774 Therefore I -0.048957378
+-1.9911177 then I -0.187397
+-2.544403 college I -0.048957378
+-1.7705758 And I -0.048957378
+-2.1540918 so I -0.048957378
+-1.2360636 : I -0.048957378
+-1.6455997 employers I -0.048957378
+-1.8020827 like I -0.048957378
+-1.2064477 when I -0.11268411
+-1.6699961 what I -0.048957378
+-2.068887 could I -0.048957378
+-1.1479716 because I -0.048957378
+-1.9872543 If I -0.048957378
+-0.8012204 whatever I -0.048957378
+-1.7496611 responsibilities I -0.048957378
+-1.596395 which I -0.048957378
+-1.5678455 however I -0.048957378
+-1.9402759 where I -0.048957378
+-0.9847774 extent I -0.048957378
+-1.5031465 So I -0.048957378
+-1.2753446 Perhaps I -0.048957378
+-1.8586221 think I -0.048957378
+-1.0509765 why I -0.13618872
+-1.1556058 Australia I -0.048957378
+-0.9847774 summary I -0.048957378
+-0.6890748 Overall I -0.048957378
+-1.2753446 When I -0.04895735
+-0.6890748 justice I -0.048957378
+-1.282293 wish I -0.048957378
+-0.9847774 U.S.
I -0.048957378 +-2.7368789 I disagree -0.37097517 +-0.99501497 partially disagree -0.048957378 +-0.7942637 disagree with -0.70552063 +-2.2168078 student with -0.048957378 +-2.002276 , with -0.048957378 +-1.413362 case with -0.048957378 +-2.277343 in with -0.048957378 +-2.114054 and with -0.04895735 +-1.8664051 working with -0.048957378 +-1.6857973 students with -0.04895735 +-2.3546855 time with -0.048957378 +-2.1668954 jobs with -0.048957378 +-2.3388495 work with -0.13618872 +-0.3862648 agree with -0.70552063 +-1.4221039 begin with -0.048957378 +-2.2686563 are with -0.048957378 +-1.52707 experiences with -0.048957378 +-0.9731246 team with -0.048957378 +-1.408685 productive with -0.048957378 +-0.93599224 relationships with -0.048957378 +-2.1311364 people with -0.048957378 +-1.5142454 true with -0.048957378 +-1.5826072 help with -0.04895735 +-2.1114128 do with -0.048957378 +-0.6831375 familiar with -0.048957378 +-0.40507883 interfere with -0.187397 +-1.8622386 activities with -0.048957378 +-1.7455719 life with -0.187397 +-1.4935277 up with -0.048957378 +-0.6831375 combines with -0.048957378 +-1.7117923 income with -0.048957378 +-1.7904192 living with -0.048957378 +-0.9731246 communicate with -0.048957378 +-1.52707 responsible with -0.048957378 +-0.9731246 flexible with -0.048957378 +-0.9731246 fill with -0.048957378 +-0.6831375 cope with -0.048957378 +-1.4540396 out with -0.04895735 +-1.7978033 those with -0.048957378 +-0.9731246 comparison with -0.048957378 +-1.4658345 problem with -0.048957378 +-1.5560237 families with -0.048957378 +-1.7279987 day with -0.048957378 +-0.7942637 drinking with -0.048957378 +-0.6831375 association with -0.048957378 +-0.6831375 remain with -0.048957378 +-0.9731246 organized with -0.048957378 +-1.1384469 basis with -0.048957378 +-0.6831375 Along with -0.048957378 +-1.1384469 schedules with -0.048957378 +-1.5187173 complete with -0.048957378 +-0.6831375 along with -0.048957378 +-1.56482 spent with -0.048957378 +-0.6831375 interact with -0.048957378 +-0.6831375 interacting with -0.048957378 +-1.5603997 me with -0.048957378 +-1.2528772 schedule with -0.048957378 +-0.6831375 continued with -0.048957378 +-0.6831375 efficiently with -0.048957378 +-0.6831375 helping with -0.048957378 +-0.2700946 dealing with -0.13332285 +-0.9731246 communicating with -0.048957378 +-0.6831375 oneself with -0.048957378 +-1.408685 habits with -0.048957378 +-0.6831375 tie-in with -0.048957378 +-0.9731246 claim with -0.048957378 +-0.6831375 cooperation with -0.048957378 +-0.9731246 wrong with -0.048957378 +-0.6831375 bonus with -0.048957378 +-0.6831375 interferes with -0.048957378 +-0.6831375 trapped with -0.048957378 +-0.6831375 boom with -0.048957378 +-1.8179587 restaurants with -0.048957378 +-0.6831375 relationship with -0.048957378 +-0.6831375 interfering with -0.048957378 +-0.6831375 burdened with -0.048957378 +-0.6831375 coupled with -0.048957378 +-0.6831375 diluted with -0.048957378 +-0.6831375 associated with -0.048957378 +-0.6831375 rests with -0.048957378 +-0.6831375 held with -0.048957378 +-0.6831375 interviewed with -0.048957378 +-0.6831375 implemented with -0.048957378 +-0.6831375 entangled with -0.048957378 +-2.0696561 with this -0.73184955 +-2.0768657 for this -0.1722646 +-2.4861102 job this -0.048957378 +-1.6605835 upon this -0.048957378 +-2.0597382 , this -0.08422062 +-1.9358038 In this -0.048957378 +-1.856033 in this -0.099168696 +-2.4482787 and this -0.048957378 +-1.9986331 of this -0.048957378 +-2.6253626 to this -0.048957378 +-1.7144383 For this -0.27955192 +-2.3825915 
from this -0.048957378 +-2.5046253 on this -0.048957378 +-1.9938124 then this -0.048957378 +-1.5521976 but this -0.048957378 +-1.5063467 consider this -0.048957378 +-1.9040649 at this -0.048957378 +-1.9318298 doing this -0.048957378 +-2.2408266 do this -0.13618872 +-1.9063913 believe this -0.048957378 +-1.3712003 during this -0.048957378 +-0.68921393 maintain this -0.048957378 +-1.8533039 support this -0.048957378 +-1.5063467 feel this -0.048957378 +-1.1584141 ease this -0.048957378 +-1.3677793 though this -0.048957378 +-1.2758812 Perhaps this -0.048957378 +-1.6058056 since this -0.048957378 +-1.6058056 why this -0.048957378 +-0.68921393 admit this -0.048957378 +-1.374476 realize this -0.048957378 +-0.9850521 recognize this -0.048957378 +-1.3677793 certainly this -0.048957378 +-0.9850521 Doing this -0.048957378 +-0.68921393 Interrupting this -0.048957378 +-0.68921393 Disrupting this -0.048957378 +-0.9850521 distract this -0.048957378 +-0.68921393 Keeping this -0.048957378 +-2.4409037 this statement -0.38603142 +-2.85306 the statement -0.11268411 +-2.3608074 The statement -0.048957378 +-1.294531 above statement -0.048957378 +-2.5278392 for -0.048957378 +-1.0219545 statement for -0.13332285 +-1.0779753 reasons for -0.1722646 +-2.2203712 that for -0.048957378 +-1.5585953 job for -0.048957378 +-1.9413155 is for -0.04895735 +-0.17291874 preparation for -0.048957378 +-1.5266267 full-time for -0.048957378 +-2.0927892 will for -0.048957378 +-1.7022598 , for -0.19825454 +-2.0568473 not for -0.048957378 +-2.114417 and for -0.048957378 +-2.3022451 to for -0.048957378 +-1.944184 study for -0.048957378 +-1.5204926 studying for -0.048957378 +-1.7073846 experience for -0.048957378 +-1.3835486 working for -0.048957378 +-0.6986842 reason for -0.048957378 +-1.6065047 time for -0.08422062 +-1.9452095 school for -0.048957378 +-2.1568348 work for -0.048957378 +-1.774954 -LRB- for -0.048957378 +-1.8295575 but for -0.048957378 +-1.5234151 only for -0.04895735 +-0.7025396 necessary for -0.048957378 +-0.598935 pay for -0.07613316 +-1.4443347 fees for -0.048957378 +-1.2561064 money for -0.048957378 +-1.6203611 And for -0.048957378 +-1.0219545 bad for -0.187397 +-1.1141013 market for -0.048957378 +-1.4229276 does for -0.048957378 +-1.7952361 even for -0.048957378 +-0.572456 foundation for -0.048957378 +-0.95634013 history for -0.048957378 +-0.95634013 candidate for -0.048957378 +-1.3513856 better for -0.048957378 +-2.0051796 skills for -0.048957378 +-0.69287807 important for -0.6495775 +-1.4366772 issue for -0.048957378 +-1.7528588 enough for -0.048957378 +-1.3670125 need for -0.048957378 +-1.3944039 good for -0.11268411 +-1.1182201 idea for -0.04895735 +-1.8215423 parents for -0.048957378 +-1.4584918 true for -0.048957378 +-1.9731671 do for -0.048957378 +-1.3862062 chance for -0.048957378 +-0.5652439 opportunity for -0.048957378 +-1.0617472 them for -0.13332285 +-1.5087376 useful for -0.048957378 +-0.97337675 done for -0.048957378 +-1.2296209 fit for -0.048957378 +-1.9170562 get for -0.048957378 +-0.6744481 unprepared for -0.048957378 +-1.6189606 responsibilities for -0.048957378 +-1.3862062 used for -0.048957378 +-1.3643708 late for -0.048957378 +-1.5915792 difficult for -0.048957378 +-1.2214524 cash for -0.048957378 +-1.6321046 income for -0.048957378 +-1.6691784 way for -0.048957378 +-0.6640692 responsible for -0.04895735 +-1.2214524 arguments for -0.048957378 +-1.1141013 '' for -0.048957378 +-0.97991747 value for -0.048957378 +-0.6744481 passion for -0.048957378 +-1.4584918 great for -0.048957378 +-0.95634013 
stand for -0.048957378 +-1.1141013 contacts for -0.048957378 +-1.4584918 best for -0.048957378 +-1.1141013 matter for -0.048957378 +-1.2214524 him for -0.048957378 +-1.7452198 hard for -0.048957378 +-1.4229276 Working for -0.048957378 +-1.1141013 material for -0.048957378 +-1.2214524 concern for -0.048957378 +-1.3014593 hold for -0.048957378 +-0.6744481 traverse for -0.048957378 +-1.3537269 things for -0.048957378 +-0.6744481 paying for -0.048957378 +-1.3014593 rent for -0.048957378 +-1.4584918 made for -0.048957378 +-0.95634013 ready for -0.048957378 +-1.6564035 day for -0.048957378 +-0.95634013 lost for -0.048957378 +-1.198989 responsibility for -0.04895735 +-0.6744481 identity for -0.048957378 +-0.6744481 handle for -0.048957378 +-1.5018239 independence for -0.048957378 +-1.2214524 positive for -0.048957378 +-1.1141013 common for -0.048957378 +-0.95634013 applications for -0.048957378 +-0.69226825 possibilities for -0.048957378 +-0.6744481 aptitude for -0.048957378 +-1.1141013 basis for -0.048957378 +-1.3643708 around for -0.048957378 +-0.7841116 essential for -0.048957378 +-1.1141013 budget for -0.048957378 +-1.1141013 distraction for -0.048957378 +-0.95634013 conditions for -0.048957378 +-0.6744481 excuses for -0.048957378 +-1.1942735 right for -0.048957378 +-1.4366772 save for -0.048957378 +-0.69226825 respect for -0.048957378 +-1.3014593 policy for -0.048957378 +-0.95634013 mainly for -0.048957378 +-1.1141013 prepare for -0.187397 +-0.6744481 prepared for -0.048957378 +-1.1141013 critical for -0.048957378 +-0.6744481 empathy for -0.048957378 +-0.786483 required for -0.048957378 +-0.6744481 replacement for -0.048957378 +-0.6744481 struggle for -0.048957378 +-0.7841116 look for -0.04895735 +-0.6744481 looking for -0.048957378 +-1.1141013 expense for -0.048957378 +-1.1141013 loan for -0.048957378 +-1.3093133 materials for -0.048957378 +-1.1141013 low for -0.048957378 +-1.1226108 rewarding for -0.048957378 +-1.2214524 wanted for -0.048957378 +-0.6744481 options for -0.048957378 +-0.6744481 grandparents for -0.048957378 +-0.6744481 exchange for -0.048957378 +-0.6744481 arrange for -0.048957378 +-0.572456 paid for -0.048957378 +-0.6744481 Searching for -0.048957378 +-0.6744481 begging for -0.048957378 +-0.6744481 traumatizing for -0.048957378 +-0.6744481 bond for -0.048957378 +-0.6744481 dogs for -0.048957378 +-0.6744481 ideal for -0.048957378 +-0.6744481 disparate for -0.048957378 +-0.6744481 applying for -0.048957378 +-0.6744481 chances for -0.048957378 +-0.6744481 charge for -0.048957378 +-0.6744481 tool for -0.048957378 +-0.95634013 finishing for -0.048957378 +-0.6744481 carrying for -0.048957378 +-0.6744481 availability for -0.048957378 +-1.789221 smoke for -0.048957378 +-0.6744481 room for -0.048957378 +-0.95634013 justification for -0.048957378 +-0.95634013 smoky for -0.048957378 +-0.6744481 treatment for -0.048957378 +-1.9351243 for several -0.40449065 +-2.7689722 be several -0.048957378 +-3.2900462 , several -0.048957378 +-1.7594332 For several -0.048957378 +-2.3036728 are several -0.048957378 +-1.6385572 offer several -0.048957378 +-2.353659 do several -0.048957378 +-1.5323278 waste several -0.048957378 +-1.8572801 had several -0.048957378 +-0.77503145 several reasons -0.25144583 +-2.7172563 the reasons -0.048957378 +-2.3028388 many reasons -0.048957378 +-2.6201468 of reasons -0.13618872 +-2.2887797 The reasons -0.048957378 +-1.9765648 financial reasons -0.048957378 +-1.6304226 following reasons -0.13618872 +-1.8151026 more reasons -0.048957378 +-0.3378226 two reasons 
-0.12260215 +-1.0582707 main reasons -0.048957378 +-2.1772497 these reasons -0.2403142 +-1.3826258 various reasons -0.048957378 +-0.475587 three reasons -0.099168696 +-2.3262591 my reasons -0.048957378 +-1.2878569 above reasons -0.187397 +-0.9911411 excellent reasons -0.048957378 +-2.0520139 . -0.048957378 +-1.1706125 disagree . -0.048957378 +-1.2696567 this . -0.50140065 +-1.3076993 statement . -0.40449065 +-1.6404672 for . -0.27955192 +-0.7386445 reasons . -0.9485587 +-1.0515057 it . -0.8024306 +-1.8675882 part-time . -0.048957378 +-1.0911365 job . -1.3934952 +-1.7278584 is . -0.27955192 +-1.2126651 student . -0.70552063 +-0.45037383 graduation . -0.75667316 +-1.619139 not . -0.048957378 +-0.8031024 case . -0.27955192 +-0.5351286 employment . -0.70552063 +-1.7663496 in . -0.187397 +-0.6888471 field . -0.50140065 +-1.8240277 to . -0.048957378 +-0.9988124 study . -0.9485587 +-0.79016376 studying . -0.6475287 +-1.0946081 engineering . -0.048957378 +-0.81913006 experience . -0.8024306 +-1.6405205 from . -0.048957378 +-1.0953617 working . -0.5805819 +-1.0626299 restaurant . -0.27955192 +-1.6075702 there . -0.048957378 +-1.1786642 students . -1.0065507 +-0.8031024 stress . -0.187397 +-1.6347525 on . -0.187397 +-0.65351653 studies . -0.9485587 +-1.1859628 time . -0.8024306 +-1.3356895 jobs . -1.12465 +-1.5887046 one . -0.048957378 +-1.1706125 tired . -0.048957378 +-1.416219 focus . -0.048957378 +-0.63527 school . -0.9785219 +-1.0798451 work . -1.0065507 +-0.6345344 produced . -0.048957378 +-1.6440349 such . -0.048957378 +-0.950476 -RRB- . -0.50140065 +-1.1966463 pursuing . -0.048957378 +-1.7191901 should . -0.048957378 +-1.416219 necessary . -0.048957378 +-1.0604573 college . -1.0811843 +-1.6987075 have . -0.048957378 +-0.8696997 money . -1.0577031 +-1.2445956 say . -0.048957378 +-1.4950117 you . -0.048957378 +-1.5677115 some . -0.048957378 +-1.3076993 bad . -0.187397 +-1.2445956 begin . -0.048957378 +-1.2723796 today . -0.048957378 +-0.9397725 degree . -0.187397 +-0.8260182 well . -0.70552063 +-1.6063483 are . -0.048957378 +-0.52723414 career . -0.5805819 +-0.63132155 experiences . -0.27955192 +-1.2941535 offer . -0.048957378 +-0.9457105 benefit . -0.187397 +-0.83856654 future . -0.70552063 +-1.2458991 consider . -0.048957378 +-0.65543157 skill . -0.187397 +-1.4054539 academic . -0.048957378 +-1.3935354 more . -0.187397 +-1.3123941 employers . -0.048957378 +-1.4223531 better . -0.048957378 +-1.0946081 organization . -0.048957378 +-1.1529151 industry . -0.048957378 +-0.8736102 skills . -0.75667316 +-1.0358967 important . -0.5805819 +-1.0120412 employer . -0.048957378 +-0.58033204 management . -0.27955192 +-1.435324 taking . -0.048957378 +-0.6345344 positively . -0.048957378 +-1.4533774 through . -0.048957378 +-1.1966463 productive . -0.048957378 +-0.9450017 themselves . -0.27955192 +-1.3627458 environment . -0.048957378 +-0.63132155 position . -0.27955192 +-1.0120412 abilities . -0.048957378 +-1.2810172 same . -0.048957378 +-0.9457105 financially . -0.187397 +-0.58033204 issue . -0.27955192 +-1.4707845 enough . -0.048957378 +-1.3622978 food . -0.048957378 +-0.73789895 times . -0.187397 +-1.2557908 need . -0.187397 +-0.6345344 emergency . -0.048957378 +-0.73789895 earned . -0.187397 +-1.433393 good . -0.048957378 +-1.4532349 idea . -0.187397 +-0.9517308 others . -0.187397 +-0.85823375 like . -0.27955192 +-1.2784038 parents . -0.50140065 +-1.0731736 friends . -0.187397 +-0.8628125 relationships . -0.187397 +-1.1790495 people . -0.50140065 +-1.2583388 true . -0.048957378 +-1.3076993 College . 
-0.048957378 +-0.8829074 stable . -0.048957378 +-1.6067777 do . -0.048957378 +-1.3633446 something . -0.048957378 +-0.8829074 advantage . -0.048957378 +-1.4549133 high . -0.048957378 +-1.4432774 classes . -0.048957378 +-1.3398672 know . -0.048957378 +-0.24155144 ft. . -0.27955192 +-1.5891082 all . -0.048957378 +-1.0819532 them . -0.5805819 +-0.6345344 futures . -0.048957378 +-0.61219954 society . -0.6475287 +-0.8829074 specialized . -0.048957378 +-0.6345344 contexts . -0.048957378 +-0.8829074 specialty . -0.048957378 +-1.1529151 itself . -0.048957378 +-0.6345344 specialization . -0.048957378 +-0.8829074 fields . -0.048957378 +-0.3848129 temptation . -0.187397 +-1.2157034 activities . -0.187397 +-0.6345344 socialize . -0.048957378 +-0.6345344 results . -0.048957378 +-0.8829074 teachers . -0.048957378 +-1.0946081 demands . -0.048957378 +-0.74016064 life . -1.0577031 +-0.8829074 workforce . -0.187397 +-1.0626299 responsibilities . -0.27955192 +-1.178503 university . -0.27955192 +-1.5058556 up . -0.048957378 +-1.0946081 sleeping . -0.048957378 +-0.6345344 repercussions . -0.048957378 +-1.4300016 difficult . -0.048957378 +-1.0946081 lifestyle . -0.048957378 +-1.0334567 helpful . -0.187397 +-1.1139877 regard . -0.048957378 +-0.9457105 finances . -0.187397 +-0.8829074 responsibly . -0.048957378 +-1.2129301 cards . -0.048957378 +-1.2129301 loans . -0.048957378 +-0.5454324 etc. . -0.187397 +-0.6345344 peril . -0.048957378 +-1.0781795 income . -0.187397 +-0.6345344 debts . -0.048957378 +-1.2583388 expenses . -0.40449065 +-0.86117655 family . -0.50140065 +-1.2583388 adult . -0.048957378 +-0.7572681 development . -0.187397 +-1.2615209 independent . -0.048957378 +-0.90136683 graduate . -0.40449065 +-1.2941535 professional . -0.048957378 +-0.84055513 customers . -0.27955192 +-1.2994715 responsible . -0.048957378 +-0.6345344 achievement . -0.187397 +-0.8829074 necessity . -0.048957378 +-1.284379 sense . -0.048957378 +-1.2810172 force . -0.187397 +-0.6345344 staff . -0.048957378 +-0.6623816 situation . -0.187397 +-1.2994715 year . -0.187397 +-0.5871588 education . -1.0065507 +-0.82075477 world . -0.5805819 +-0.9789165 success . -0.187397 +-1.1966463 entirely . -0.048957378 +-0.8628125 importance . -0.187397 +-1.3941447 own . -0.048957378 +-0.6345344 pockets . -0.048957378 +-1.2445956 transition . -0.048957378 +-0.73789895 campus . -0.27955192 +-0.6345344 cultivated . -0.048957378 +-1.3918109 without . -0.048957378 +-1.4285575 about . -0.048957378 +-0.6345344 product . -0.048957378 +-1.1139877 limited . -0.048957378 +-1.0946081 assignments . -0.048957378 +-1.2308197 either . -0.048957378 +-0.9295806 too . -0.40449065 +-0.8829074 complicated . -0.048957378 +-0.58033204 business . -0.27955192 +-0.8829074 ethic . -0.048957378 +-1.588174 learn . -0.048957378 +-0.86921066 lessons . -0.187397 +-0.6345344 exhausting . -0.048957378 +-0.6345344 straight . -0.048957378 +-1.2308197 quit . -0.048957378 +-0.58033204 beneficial . -0.40449065 +-1.3337915 possible . -0.048957378 +-1.1691996 years . -0.187397 +-1.0120412 medicine . -0.048957378 +-1.3330916 three . -0.048957378 +-1.0120412 lesson . -0.048957378 +-1.3774596 before . -0.048957378 +-0.8829074 organizations . -0.048957378 +-0.6345344 gates . -0.048957378 +-1.2723796 meet . -0.048957378 +-1.3722197 new . -0.048957378 +-1.4377742 against . -0.048957378 +-1.0946081 load . -0.048957378 +-0.65543157 performance . -0.187397 +-1.2750996 books . -0.048957378 +-0.6345344 hindrance . -0.048957378 +-0.97314656 worked . -0.187397 +-0.8829074 workload . 
-0.048957378 +-1.3486543 needs . -0.187397 +-0.874662 costs . -0.27955192 +-1.2458991 afford . -0.048957378 +-0.5454324 process . -0.187397 +-1.1529151 system . -0.048957378 +-1.0946081 practice . -0.187397 +-0.5205023 adults . -0.40449065 +-0.6345344 counterparts . -0.048957378 +-1.2308197 problem . -0.048957378 +-1.2941535 since . -0.048957378 +-1.0120412 concerned . -0.048957378 +-0.8829074 sophisticated . -0.048957378 +-1.2129301 effect . -0.048957378 +-1.0672075 workplace . -0.40449065 +-1.284379 instead . -0.048957378 +-1.0120412 material . -0.048957378 +-1.0946081 concern . -0.048957378 +-0.8829074 worker . -0.048957378 +-0.5205023 problems . -0.27955192 +-1.2810172 families . -0.048957378 +-0.6345344 began . -0.048957378 +-1.1966463 far . -0.048957378 +-0.6345344 immersed . -0.048957378 +-1.0120412 rest . -0.048957378 +-1.0081104 lives . -0.27955192 +-1.103684 want . -0.187397 +-0.8246384 things . -0.50140065 +-0.6345344 drinks . -0.048957378 +-1.0946081 harder . -0.048957378 +-0.8829074 suffer . -0.048957378 +-0.5454324 fun . -0.187397 +-1.1966463 tasks . -0.048957378 +-0.6345344 fatal . -0.048957378 +-1.3902154 Japan . -0.50140065 +-0.65543157 America . -0.187397 +-0.67880464 freedom . -0.27955192 +-1.4546726 think . -0.048957378 +-1.2458991 anything . -0.048957378 +-1.2941535 why . -0.048957378 +-1.0120412 attend . -0.048957378 +-0.65543157 wisely . -0.27955192 +-0.6345344 commodity . -0.048957378 +-1.421118 hours . -0.048957378 +-0.8829074 5 . -0.048957378 +-0.8087627 days . -0.27955192 +-0.97314656 week . -0.187397 +-1.0946081 easily . -0.048957378 +-0.66825956 day . -0.40449065 +-0.8829074 coworkers . -0.048957378 +-0.6345344 underway . -0.048957378 +-1.4443592 home . -0.048957378 +-1.0995541 responsibility . -0.27955192 +-1.1529151 homework . -0.048957378 +-0.8829074 completed . -0.048957378 +-1.0120412 lunch . -0.048957378 +-1.0120412 mentioned . -0.048957378 +-1.0081104 person . -0.187397 +-0.6345344 twenties . -0.048957378 +-0.6345344 remuneration . -0.048957378 +-1.2941535 independence . -0.048957378 +-1.0120412 easier . -0.048957378 +-1.0120412 ask . -0.048957378 +-0.6345344 Sydney . -0.048957378 +-1.2583388 companies . -0.048957378 +-0.8829074 test . -0.048957378 +-0.8829074 character . -0.048957378 +-0.8829074 thing . -0.048957378 +-0.24155144 economy . -0.50140065 +-0.6345344 attached . -0.048957378 +-1.0120412 common . -0.048957378 +-1.4497367 class . -0.187397 +-0.6345344 faculty . -0.048957378 +-1.0285724 individual . -0.187397 +-0.8829074 coursework . -0.048957378 +-1.3115425 discipline . -0.048957378 +-0.6345344 details . -0.048957378 +-1.0946081 co-workers . -0.048957378 +-0.7177284 age . -0.27955192 +-0.8829074 clear . -0.048957378 +-0.8829074 finance . -0.048957378 +-0.65543157 resources . -0.187397 +-0.6345344 copywriting . -0.048957378 +-0.8829074 office . -0.048957378 +-0.6345344 newcomer . -0.048957378 +-0.6345344 mentoring . -0.048957378 +-0.6345344 career-oriented . -0.048957378 +-0.6345344 branch . -0.048957378 +-0.6345344 beings . -0.048957378 +-0.75553894 opinion . -0.5805819 +-1.2308197 us . -0.048957378 +-0.8829074 under . -0.048957378 +-1.1520863 allowed . -0.048957378 +-0.8829074 conditions . -0.048957378 +-0.6345344 fate . -0.048957378 +-0.8829074 generations . -0.048957378 +-0.6345344 irreversible . -0.048957378 +-0.6345344 nation . -0.048957378 +-0.6345344 production . -0.048957378 +-0.6345344 agendas . -0.048957378 +-0.6345344 accepted . -0.048957378 +-1.39268 right . -0.048957378 +-0.6345344 self-improvement . 
-0.048957378 +-1.0946081 stage . -0.048957378 +-0.6345344 self-supporting . -0.048957378 +-0.8829074 whole . -0.048957378 +-1.0334567 youth . -0.048957378 +-0.8829074 schooling . -0.048957378 +-0.8829074 purposes . -0.048957378 +-1.0120412 desired . -0.048957378 +-0.8829074 classroom . -0.048957378 +-1.3000531 balance . -0.048957378 +-0.8829074 points . -0.048957378 +-0.8829074 socially . -0.048957378 +-0.6623816 careers . -0.187397 +-0.8829074 cities . -0.048957378 +-0.6345344 ends . -0.048957378 +-0.8829074 stronger . -0.048957378 +-0.6345344 actions . -0.048957378 +-1.1706125 together . -0.048957378 +-1.0334567 lectures . -0.048957378 +-0.6345344 degrees . -0.048957378 +-1.0120412 expensive . -0.048957378 +-1.0120412 busy . -0.048957378 +-0.6345344 professionals . -0.048957378 +-0.6345344 sources . -0.048957378 +-0.8829074 faster . -0.048957378 +-0.8628125 children . -0.187397 +-1.4461821 had . -0.048957378 +-0.6345344 pharmacy . -0.048957378 +-0.8829074 1 . -0.048957378 +-1.0120412 2 . -0.048957378 +-1.0120412 3 . -0.048957378 +-1.0120412 resume . -0.048957378 +-0.8829074 4 . -0.048957378 +-0.8829074 survive . -0.048957378 +-0.8829074 vacation . -0.048957378 +-0.8829074 path . -0.048957378 +-0.6345344 homework\/assignments . -0.048957378 +-1.0946081 using . -0.048957378 +-0.8829074 initiative . -0.048957378 +-1.0120412 customer . -0.187397 +-1.2723796 spending . -0.048957378 +-1.0120412 welcome . -0.048957378 +-1.0120412 research . -0.048957378 +-0.8829074 professor . -0.048957378 +-0.8829074 merit . -0.048957378 +-1.1706125 materials . -0.048957378 +-1.2583388 large . -0.048957378 +-0.8829074 changes . -0.048957378 +-1.2941535 me . -0.048957378 +-1.1529151 goal . -0.048957378 +-0.5454324 rewarding . -0.187397 +-0.8829074 everyday . -0.048957378 +-0.6345344 bill . -0.048957378 +-0.6345344 vacations . -0.048957378 +-1.1529151 employees . -0.048957378 +-0.6345344 multi-task . -0.048957378 +-0.6345344 assignment . -0.048957378 +-1.0120412 difficulties . -0.048957378 +-0.6345344 yourself . -0.048957378 +-0.5454324 entertainment . -0.187397 +-0.6345344 porter . -0.048957378 +-0.5454324 opinions . -0.27955192 +-0.3848129 textbook . -0.187397 +-0.6345344 lifetime . -0.048957378 +-0.6345344 glife . -0.048957378 +-1.0120412 clothes . -0.048957378 +-1.0946081 groups . -0.048957378 +-0.8829074 etiquette . -0.048957378 +-0.6345344 language . -0.048957378 +-1.0946081 weekends . -0.048957378 +-0.6345344 regimen . -0.048957378 +-0.6345344 absurd . -0.048957378 +-1.1966463 habits . -0.048957378 +-1.0120412 online . -0.048957378 +-0.6345344 mentioning . -0.048957378 +-0.6345344 failure . -0.048957378 +-0.3848129 desires . -0.187397 +-0.6345344 track . -0.048957378 +-0.8829074 claim . -0.048957378 +-0.6345344 enhancements . -0.048957378 +-0.3848129 involved . -0.187397 +-0.6345344 state . -0.048957378 +-0.8829074 population . -0.048957378 +-0.8829074 institutes . -0.048957378 +-0.6345344 guardians . -0.187397 +-0.6345344 earners . -0.048957378 +-0.8829074 anyway . -0.048957378 +-0.8829074 negligible . -0.048957378 +-0.6345344 fine . -0.048957378 +-0.6345344 discretion . -0.048957378 +-0.6345344 casework . -0.048957378 +-0.90891194 health . -0.27955192 +-0.6345344 bulb . -0.048957378 +-0.6345344 cherished . -0.048957378 +-1.1139877 ideas . -0.048957378 +-0.6345344 peers . -0.048957378 +-0.8829074 exams . -0.048957378 +-0.8829074 larger . -0.048957378 +-0.6345344 threat . -0.048957378 +-1.0946081 properly . -0.048957378 +-0.8829074 exhaustion . -0.048957378 +-1.0120412 heavily . 
-0.048957378 +-0.8829074 smaller . -0.048957378 +-0.8829074 nightclubs . -0.048957378 +-0.8829074 Vegas . -0.048957378 +-0.8829074 diploma . -0.048957378 +-0.6345344 drug . -0.048957378 +-0.6345344 prizes . -0.048957378 +-0.6345344 proud . -0.048957378 +-0.8829074 goals . -0.048957378 +-0.6345344 holiday . -0.048957378 +-1.0120412 partying . -0.048957378 +-0.6345344 funds . -0.048957378 +-0.9450017 restaurants . -0.70552063 +-0.6345344 short-sighted . -0.048957378 +-0.6345344 assistants . -0.048957378 +-0.6345344 mastered . -0.048957378 +-0.6345344 barricades . -0.048957378 +-0.6345344 cobblestones . -0.048957378 +-0.6345344 discouraged . -0.048957378 +-0.6345344 States . -0.048957378 +-0.6345344 consuming . -0.048957378 +-0.6345344 existence . -0.048957378 +-1.1529151 eat . -0.048957378 +-0.6345344 sake . -0.048957378 +-0.6345344 analysis . -0.048957378 +-0.8829074 past . -0.048957378 +-0.8829074 worries . -0.048957378 +-0.6345344 falter . -0.048957378 +-0.6345344 schoolwork . -0.048957378 +-0.6345344 overwhelmed . -0.048957378 +-0.6345344 doctor . -0.048957378 +-1.0946081 here . -0.048957378 +-0.6345344 volume . -0.048957378 +-0.6345344 afterwards . -0.048957378 +-0.6345344 pessimism . -0.048957378 +-0.6345344 perspective . -0.048957378 +-0.82472557 smoking . -0.50140065 +-0.6345344 route . -0.048957378 +-0.8554143 smoke . -0.50140065 +-0.8829074 well-being . -0.048957378 +-1.0120412 patrons . -0.048957378 +-0.6345344 establishment . -0.048957378 +-0.6345344 sick . -0.048957378 +-0.8829074 everywhere . -0.048957378 +-0.8829074 justified . -0.048957378 +-0.6345344 citizens . -0.048957378 +-0.65543157 non-smokers . -0.187397 +-0.3848129 altogether . -0.187397 +-0.6345344 stimulant . -0.048957378 +-0.6345344 fair . -0.048957378 +-0.6345344 lifestyles . -0.048957378 +-0.6345344 account . -0.048957378 +-0.6345344 ignored . -0.048957378 +-0.6345344 illnesses . 
-0.048957378 +-2.569465 While -0.048957378 +-2.7962124 it -0.048957378 +-2.071427 reasons it -0.048957378 +-1.2569702 While it -0.048957378 +-2.3158798 can it -0.048957378 +-1.2936037 that it -0.47329462 +-2.0652742 is it -0.04895735 +-2.2450578 student it -0.048957378 +-1.4679097 , it -0.28936014 +-1.8596834 and it -0.04895735 +-2.5097992 of it -0.048957378 +-2.45246 to it -0.048957378 +-2.2400465 from it -0.048957378 +-2.2520611 working it -0.048957378 +-1.581767 as it -0.20946135 +-1.9890342 there it -0.048957378 +-2.5044706 students it -0.048957378 +-1.1415883 accomplish it -0.048957378 +-1.4145563 leave it -0.048957378 +-1.6620624 if it -0.048957378 +-1.906007 then it -0.048957378 +-1.2236197 but it -0.11268411 +-1.5979834 agree it -0.048957378 +-2.345031 have it -0.048957378 +-2.2691255 money it -0.048957378 +-1.7155147 And it -0.048957378 +-1.4845351 making it -0.048957378 +-2.0601783 some it -0.048957378 +-1.4767239 does it -0.048957378 +-1.9467757 or it -0.187397 +-0.5228487 makes it -0.048957378 +-1.8309565 themselves it -0.048957378 +-1.9515239 into it -0.048957378 +-1.7841237 ft it -0.048957378 +-1.4743481 spend it -0.187397 +-1.1415883 However it -0.048957378 +-1.3856101 doing it -0.048957378 +-2.130193 when it -0.048957378 +-1.7233382 do it -0.048957378 +-1.5723684 earn it -0.048957378 +-1.9517244 while it -0.048957378 +-1.8245562 take it -0.048957378 +-1.6424872 know it -0.048957378 +-1.3429189 what it -0.11268411 +-0.807301 because it -0.1722646 +-2.1785307 them it -0.048957378 +-1.8999634 If it -0.04895735 +-1.3488941 whatever it -0.048957378 +-0.684236 Because it -0.048957378 +-2.0614793 get it -0.048957378 +-1.8513404 how it -0.187397 +-1.8507375 believe it -0.40449065 +-1.4169765 make it -0.048957378 +-1.1415883 Firstly it -0.048957378 +-1.7976577 living it -0.048957378 +-1.5425271 however it -0.048957378 +-1.880918 where it -0.048957378 +-0.9370643 find it -0.13618872 +-0.9364819 feel it -0.048957378 +-1.4725379 So it -0.048957378 +-0.9752691 hinder it -0.048957378 +-1.7597054 getting it -0.048957378 +-1.2614309 try it -0.048957378 +-1.4845351 appreciate it -0.048957378 +-1.3049499 think it -0.45249942 +-0.9752691 knows it -0.048957378 +-1.1415883 yet it -0.048957378 +-0.684236 ... 
it -0.048957378 +-1.653769 opinion it -0.048957378 +-0.684236 rush it -0.048957378 +-1.2569702 Although it -0.048957378 +-0.684236 deem it -0.048957378 +-0.684236 repay it -0.048957378 +-1.3666724 banning it -0.048957378 +-0.684236 nowadays it -0.048957378 +-2.3334107 I can -0.048957378 +-1.5800502 statement can -0.048957378 +-1.6136544 it can -0.048957378 +-2.0062232 that can -0.13618872 +-1.7916571 job can -0.15886542 +-1.6107941 full-time can -0.048957378 +-1.939055 student can -0.048957378 +-2.539219 , can -0.187397 +-2.161529 and can -0.04895735 +-2.1258276 study can -0.187397 +-1.575828 studying can -0.13618872 +-2.202622 experience can -0.048957378 +-1.8187665 students can -0.048957378 +-2.0941591 time can -0.187397 +-1.553163 jobs can -0.11268411 +-1.3493562 one can -0.04895735 +-2.1068811 school can -0.048957378 +-2.0763001 work can -0.048957378 +-2.2926884 money can -0.048957378 +-1.3953857 they can -0.12882008 +-1.9845569 you can -0.048957378 +-1.5400263 experiences can -0.048957378 +-0.9774241 qualification can -0.048957378 +-1.261102 organization can -0.048957378 +-1.7504075 environment can -0.048957378 +-1.5727148 There can -0.048957378 +-1.9623224 parents can -0.048957378 +-2.1816 people can -0.048957378 +-1.7909907 They can -0.048957378 +-1.9316736 This can -0.04895735 +-1.261102 People can -0.048957378 +-1.2651135 area can -0.048957378 +-1.261102 nor can -0.048957378 +-1.7714944 It can -0.13618872 +-1.4205079 colleges can -0.048957378 +-1.861024 how can -0.048957378 +-2.1909645 life can -0.048957378 +-1.3535233 result can -0.048957378 +-1.0764526 which can -0.048957378 +-1.2541722 family can -0.048957378 +-1.3535233 period can -0.048957378 +-1.576367 professional can -0.048957378 +-1.5293787 someone can -0.048957378 +-1.1447526 matter can -0.048957378 +-1.3535233 back can -0.048957378 +-1.1447526 Jobs can -0.048957378 +-1.6517227 person can -0.048957378 +-0.68533725 self-esteem can -0.048957378 +-1.3535233 drinking can -0.048957378 +-0.9774241 connections can -0.048957378 +-0.9774241 coursework can -0.048957378 +-1.6884791 he can -0.048957378 +-1.6247382 age can -0.048957378 +-1.6417441 we can -0.048957378 +-0.68533725 bucks can -0.048957378 +-0.9774241 damage can -0.048957378 +-0.9774241 professor can -0.048957378 +-1.5293787 large can -0.048957378 +-0.68533725 owners can -0.048957378 +-0.68533725 minds can -0.048957378 +-0.9774241 individuals can -0.048957378 +-0.9774241 relief can -0.048957378 +-0.9774241 bars can -0.048957378 +-1.9155476 smoke can -0.048957378 +-0.68533725 Nothing can -0.048957378 +-0.68533725 Smokers can -0.048957378 +-2.5185547 it be -0.048957378 +-0.77449536 can be -0.07191783 +-2.4174626 student be -0.048957378 +-0.97214806 will be -0.16607006 +-2.6945138 , be -0.048957378 +-1.9621344 often be -0.048957378 +-1.2557942 not be -0.08909472 +-2.6657038 and be -0.048957378 +-1.5385488 to be -0.14854218 +-1.0534738 also be -0.11310794 +-2.5650504 time be -0.048957378 +-0.8185822 may be -0.09038658 +-1.157237 otherwise be -0.048957378 +-0.5178241 should be -0.15640421 +-1.3153728 only be -0.04895735 +-1.7209004 To be -0.048957378 +-1.5104494 even be -0.048957378 +-1.965828 first be -0.048957378 +-1.3192201 ft be -0.048957378 +-0.7334877 could be -0.048957378 +-2.2911224 all be -0.048957378 +-0.7922647 must be -0.099168696 +-1.6854032 n't be -0.048957378 +-1.4465187 might be -0.048957378 +-0.62215215 would be -0.066231176 +-1.5708352 however be -0.048957378 +-1.6111709 never be -0.048957378 +-1.6090814 just be -0.048957378 +-0.6896313 preciously be 
-0.048957378 +-1.7188827 always be -0.048957378 +-1.5067861 actually be -0.048957378 +-1.5067861 us be -0.048957378 +-1.3697741 certainly be -0.048957378 +-0.6896313 t be -0.048957378 +-0.98587745 \ be -0.048957378 +-2.3119407 be argued -0.187397 +-1.3161981 disagree that -0.048957378 +-1.5195582 statement that -0.048957378 +-2.1029716 for that -0.187397 +-1.977956 reasons that -0.048957378 +-1.933587 it that -0.048957378 +-0.6761991 argued that -0.048957378 +-2.2545807 that that -0.048957378 +-1.4560189 job that -0.11268411 +-1.4581163 is that -0.08422062 +-2.0702202 student that -0.048957378 +-1.8952377 , that -0.15838256 +-2.0896373 not that -0.048957378 +-1.3800348 case that -0.048957378 +-1.7984354 employment that -0.048957378 +-2.3107936 in that -0.048957378 +-1.9064859 and that -0.048957378 +-2.2850392 of that -0.048957378 +-1.3161981 nature that -0.048957378 +-2.170326 to that -0.04895735 +-1.6763852 course that -0.048957378 +-1.9690017 study that -0.048957378 +-1.2867079 experience that -0.11268411 +-1.770291 reason that -0.048957378 +-1.655825 students that -0.09038658 +-1.924023 studies that -0.048957378 +-1.9800833 time that -0.048957378 +-1.5073028 jobs that -0.04895735 +-1.955836 one that -0.048957378 +-1.9677188 school that -0.048957378 +-1.9631096 work that -0.048957378 +-1.6653233 being that -0.048957378 +-2.0027235 if that -0.048957378 +-1.4692521 f that -0.048957378 +-1.9557992 such that -0.048957378 +-1.8512307 but that -0.048957378 +-1.0383663 agree that -0.26892814 +-2.1262746 money that -0.048957378 +-0.5449403 say that -0.04895735 +-1.6360705 And that -0.048957378 +-1.928339 so that -0.048957378 +-0.6761991 advertisements that -0.048957378 +-1.4321607 positions that -0.048957378 +-1.2276541 ways that -0.048957378 +-1.4881768 experiences that -0.048957378 +-1.475821 benefits that -0.048957378 +-1.2351948 skill that -0.048957378 +-1.7570267 better that -0.048957378 +-1.3089265 industry that -0.048957378 +-1.0961084 skills that -0.099168696 +-1.6803458 important that -0.048957378 +-1.6545362 environment that -0.048957378 +-0.6761991 complain that -0.048957378 +-1.7697875 enough that -0.048957378 +-1.609811 food that -0.048957378 +-1.6333432 idea that -0.048957378 +-1.4465711 parents that -0.048957378 +-1.439056 relationships that -0.048957378 +-2.0009093 people that -0.048957378 +-1.4253732 see that -0.048957378 +-1.3089265 show that -0.048957378 +-0.76009446 something that -0.048957378 +-1.1038072 know that -0.187397 +-1.1189373 opportunities that -0.048957378 +-2.0039754 all that -0.048957378 +-1.9398266 get that -0.048957378 +-1.3800348 club that -0.048957378 +-1.7805085 activities that -0.048957378 +-1.4253732 workers that -0.048957378 +-1.2276541 demands that -0.048957378 +-0.516151 believe that -0.19768323 +-1.6987407 life that -0.04895735 +-1.6328945 responsibilities that -0.048957378 +-1.1189373 aware that -0.048957378 +-1.3730136 factor that -0.048957378 +-1.5067358 force that -0.048957378 +-1.8510748 world that -0.048957378 +-0.6761991 consequence that -0.048957378 +-1.8360258 out that -0.048957378 +-1.1189373 energy that -0.048957378 +-1.4692521 someone that -0.048957378 +-1.4321607 feel that -0.048957378 +-1.1189373 subjects that -0.048957378 +-1.3748116 than that -0.048957378 +-0.6761991 imply that -0.048957378 +-1.5130997 just that -0.048957378 +-1.1189373 lesson that -0.048957378 +-0.8401329 understand that -0.048957378 +-0.95969737 power that -0.048957378 +-1.3089265 system that -0.048957378 +-1.3800348 effect that -0.048957378 +-0.6761991 hope that 
-0.048957378 +-0.6761991 follows that -0.048957378 +-1.3932617 problems that -0.048957378 +-0.6761991 waters that -0.048957378 +-0.8738127 things that -0.11268411 +-1.5392416 Students that -0.048957378 +-0.6523933 think that -0.048957378 +-0.92316115 anything that -0.048957378 +-0.95969737 argue that -0.048957378 +-1.5130997 why that -0.048957378 +-1.3089265 homework that -0.048957378 +-1.1189373 lunch that -0.048957378 +-0.46796864 realize that -0.04895735 +-1.1189373 discover that -0.048957378 +-0.6761991 saying that -0.048957378 +-0.95969737 fact that -0.04895735 +-1.109652 opinion that -0.04895735 +-1.3089265 policy that -0.048957378 +-0.6761991 gains that -0.048957378 +-0.95969737 points that -0.048957378 +-0.6761991 belief that -0.048957378 +-0.95969737 doubt that -0.048957378 +-1.2276541 With that -0.048957378 +-0.6761991 field\/industry that -0.048957378 +-1.1267675 seems that -0.048957378 +-0.95969737 seen that -0.048957378 +-1.3089265 employees that -0.048957378 +-0.6761991 feels that -0.048957378 +-0.6761991 structures that -0.048957378 +-0.6761991 life-lesson that -0.048957378 +-0.6761991 self that -0.048957378 +-0.6761991 recommend that -0.048957378 +-0.6761991 burdens that -0.048957378 +-0.6761991 aspect that -0.048957378 +-1.1267675 said that -0.048957378 +-0.6761991 seemed that -0.048957378 +-0.6761991 media that -0.048957378 +-0.95969737 popular that -0.048957378 +-0.6761991 Assuming that -0.048957378 +-0.95969737 goals that -0.048957378 +-0.6761991 cogitation that -0.048957378 +-0.6761991 assume that -0.048957378 +-0.6761991 heard that -0.048957378 +-2.5958927 with having -0.048957378 +-2.4080775 for having -0.048957378 +-2.3830268 that having -0.187397 +-2.4748268 is having -0.048957378 +-2.022009 , having -0.37807262 +-2.6035779 not having -0.048957378 +-2.341326 by having -0.048957378 +-2.1752987 and having -0.048957378 +-2.0305338 of having -0.2403142 +-3.0623753 to having -0.187397 +-2.8203194 students having -0.048957378 +-2.112976 but having -0.048957378 +-1.9406985 believe having -0.187397 +-1.2867545 Not having -0.048957378 +-1.2760465 without having -0.187397 +-1.5793368 between having -0.048957378 +-1.8934716 think having -0.048957378 +-1.5226038 actually having -0.048957378 +-1.1739521 with a -0.07333779 +-1.0912147 for a -0.07465968 +-2.045371 can a -0.048957378 +-1.2453445 be a -0.066231176 +-1.7364441 that a -0.14912641 +-0.46919647 having a -0.5129156 +-2.022187 job a -0.048957378 +-1.0084472 is a -0.101657696 +-1.9704068 student a -0.048957378 +-1.5820451 , a -0.21179748 +-1.7040949 often a -0.048957378 +-1.6529956 not a -0.04895735 +-1.685472 In a -0.048957378 +-1.184262 cases a -0.048957378 +-1.4065168 by a -0.13618872 +-1.023314 in a -0.12436704 +-1.6733795 and a -0.048957378 +-1.2570825 of a -0.15828148 +-1.695535 to a -0.087055564 +-1.2102299 has a -0.04895735 +-1.9556051 experience a -0.048957378 +-0.8583615 gain a -0.048957378 +-1.3920106 from a -0.048957378 +-1.1591635 working a -0.3269201 +-0.7682663 as a -0.086061224 +-1.8707192 students a -0.04895735 +-1.4323175 Such a -0.187397 +-0.94793266 mean a -0.048957378 +-1.4105668 on a -0.09038658 +-1.6129414 also a -0.048957378 +-1.1020619 accomplish a -0.048957378 +-1.3735784 work a -0.13618872 +-1.2014738 being a -0.048957378 +-1.32343 if a -0.30637136 +-0.91683936 such a -0.048957378 +-1.7202098 then a -0.048957378 +-1.3431742 pursuing a -0.048957378 +-1.6536006 pay a -0.048957378 +-1.0163296 have a -0.45843184 +-1.7795784 you a -0.048957378 +-1.4163891 making a -0.048957378 +-1.4744501 following a 
-0.048957378 +-1.5516328 To a -0.048957378 +-1.3748144 become a -0.04895735 +-0.94793266 guarantee a -0.048957378 +-1.4618114 or a -0.048957378 +-1.7457005 even a -0.048957378 +-1.514834 are a -0.048957378 +-0.77896816 As a -0.13618872 +-1.4744501 offer a -0.048957378 +-1.8568234 more a -0.048957378 +-0.85208535 whether a -0.048957378 +-1.1663418 taking a -0.048957378 +-1.3119683 through a -0.048957378 +-0.9547594 into a -0.048957378 +-0.9927116 at a -0.0796529 +-1.1020619 Having a -0.43442175 +-1.7741511 need a -0.048957378 +-1.6174822 like a -0.048957378 +-1.328294 doing a -0.048957378 +-1.941914 when a -0.048957378 +-1.4824228 earn a -0.11268411 +-1.2879579 take a -0.13618872 +-1.3687944 chance a -0.048957378 +-1.5365813 know a -0.048957378 +-1.8570228 what a -0.048957378 +-1.2702285 them a -0.11268411 +-0.9871 provide a -0.048957378 +-1.7106837 If a -0.3855526 +-0.63674825 get a -0.23204191 +-1.7201445 believe a -0.048957378 +-1.4558547 entering a -0.048957378 +-1.206096 particularly a -0.048957378 +-1.7799845 up a -0.048957378 +-1.7093724 make a -0.048957378 +-1.521916 still a -0.048957378 +-1.3918545 quite a -0.048957378 +-0.94793266 Maintaining a -0.048957378 +-1.309868 managing a -0.048957378 +-0.94793266 least a -0.048957378 +-1.4323175 been a -0.048957378 +-1.3154595 during a -0.048957378 +-0.9679041 earning a -0.048957378 +-0.86637026 lead a -0.048957378 +-1.1210268 enjoy a -0.048957378 +-1.4357413 once a -0.048957378 +-0.94793266 require a -0.048957378 +-1.466621 force a -0.048957378 +-1.89936 education a -0.048957378 +-1.206096 balancing a -0.048957378 +-0.46515924 finding a -0.13618872 +-0.85208535 provides a -0.048957378 +-1.2830615 improve a -0.048957378 +-0.94793266 Without a -0.048957378 +-1.4404591 place a -0.048957378 +-1.3988943 out a -0.048957378 +-1.7400194 where a -0.048957378 +-1.1020619 giving a -0.048957378 +-1.206152 find a -0.13618872 +-0.94793266 Should a -0.048957378 +-1.206096 requires a -0.048957378 +-1.6571989 support a -0.048957378 +-1.2830615 last a -0.048957378 +-1.1020619 building a -0.048957378 +-1.8593731 learn a -0.048957378 +-0.67003465 pursue a -0.048957378 +-1.6988808 than a -0.048957378 +-1.4404591 enter a -0.048957378 +-0.67003465 secure a -0.048957378 +-1.4404591 meet a -0.048957378 +-1.5036985 worked a -0.048957378 +-1.4253675 costs a -0.048957378 +-0.72692496 getting a -0.11268411 +-1.2830615 hold a -0.048957378 +-1.1633122 hours a -0.187397 +-1.5843508 days a -0.048957378 +-1.1122347 risk a -0.048957378 +-0.5695075 adding a -0.048957378 +-0.94793266 securing a -0.048957378 +-1.1020619 studied a -0.048957378 +-0.9130517 obtain a -0.048957378 +-1.1020619 -- a -0.048957378 +-0.94793266 instils a -0.048957378 +-0.67003465 opens a -0.048957378 +-1.1020619 yet a -0.048957378 +-1.1020619 planning a -0.048957378 +-0.94793266 Taking a -0.048957378 +-1.1020619 developing a -0.048957378 +-1.1020619 sometimes a -0.048957378 +-0.94793266 generations a -0.048957378 +-0.94793266 On a -0.048957378 +-1.4961588 balance a -0.048957378 +-0.2673626 becoming a -0.048957378 +-1.4404591 develop a -0.048957378 +-1.206096 When a -0.048957378 +-1.1020619 receive a -0.048957378 +-1.2171772 had a -0.048957378 +-1.206096 given a -0.048957378 +-0.94793266 nights a -0.048957378 +-0.77896816 within a -0.048957378 +-0.94793266 profit a -0.048957378 +-0.5695075 puts a -0.048957378 +-1.4744501 me a -0.048957378 +-0.94793266 seeing a -0.048957378 +-1.206096 over a -0.048957378 +-0.67003465 experienced a -0.048957378 +-0.688208 creating a -0.048957378 +-0.67003465 gauge a 
-0.048957378 +-0.67003465 devised a -0.048957378 +-0.67003465 deserves a -0.048957378 +-0.94793266 purchase a -0.048957378 +-0.94793266 demand a -0.048957378 +-0.67003465 seeking a -0.048957378 +-0.94793266 challenge a -0.048957378 +-0.94793266 Doing a -0.048957378 +-0.5695075 obtaining a -0.048957378 +-0.67003465 assets a -0.048957378 +-0.94793266 supporting a -0.048957378 +-0.94793266 creates a -0.048957378 +-0.67003465 yields a -0.048957378 +-0.94793266 Yet a -0.048957378 +-0.67003465 deter a -0.048957378 +-0.67003465 poses a -0.048957378 +-0.5695075 down a -0.048957378 +-0.67003465 landing a -0.048957378 +-0.67003465 train a -0.048957378 +-0.67003465 forgo a -0.048957378 +-0.67003465 known a -0.048957378 +-0.67003465 flip a -0.048957378 +-0.67003465 sustain a -0.048957378 +-0.5695075 holding a -0.187397 +-0.67003465 supply a -0.048957378 +-0.67003465 hold-down a -0.048957378 +-0.67003465 coin a -0.048957378 +-0.67003465 Obliging a -0.048957378 +-0.67003465 Implementing a -0.048957378 +-0.94793266 affect a -0.048957378 +-0.94793266 inhaling a -0.048957378 +-0.67003465 contracting a -0.048957378 +-2.3087664 for part-time -0.048957378 +-1.6438224 several part-time -0.048957378 +-1.7389517 that part-time -0.2403142 +-2.029831 having part-time -0.048957378 +-1.1177863 a part-time -1.3923726 +-2.4884188 the part-time -0.048957378 +-2.2031236 , part-time -0.3864422 +-2.4073236 not part-time -0.048957378 +-2.1750138 many part-time -0.048957378 +-1.1748263 any part-time -0.13618872 +-2.6630845 in part-time -0.048957378 +-2.5596447 and part-time -0.048957378 +-1.9850084 of part-time -0.1722646 +-2.744822 to part-time -0.048957378 +-2.5252903 their part-time -0.048957378 +-1.2943524 working part-time -0.048957378 +-1.5264637 work part-time -0.099168696 +-1.1523616 otherwise part-time -0.048957378 +-2.082084 only part-time -0.048957378 +-1.5243121 have part-time -0.8438233 +-1.7573609 And part-time -0.048957378 +-1.5961238 offer part-time -0.048957378 +-1.6334323 These part-time -0.048957378 +-1.8974323 through part-time -0.048957378 +-1.4365401 first part-time -0.187397 +-1.9796 This part-time -0.048957378 +-2.2110343 do part-time -0.048957378 +-1.8734152 take part-time -0.048957378 +-0.9825856 works part-time -0.048957378 +-1.7599086 A part-time -0.40449065 +-0.9825856 fill part-time -0.048957378 +-1.8478122 own part-time -0.048957378 +-1.7916908 find part-time -0.048957378 +-1.5480624 best part-time -0.048957378 +-1.9903675 much part-time -0.048957378 +-1.6361138 worked part-time -0.048957378 +-1.7971907 getting part-time -0.048957378 +-0.9825856 so-called part-time -0.048957378 +-0.94271684 Working part-time -0.048957378 +-0.687964 average part-time -0.048957378 +-1.2710757 Although part-time -0.048957378 +-1.1523616 appropriate part-time -0.048957378 +-1.8027608 had part-time -0.048957378 +-0.687964 located part-time -0.048957378 +-0.687964 take-up part-time -0.048957378 +-0.9825856 seek part-time -0.048957378 +-0.9825856 lucrative part-time -0.048957378 +-1.3732088 a job -0.15678221 +-0.7087217 part-time job -0.37823045 +-1.6573156 full-time job -0.04895735 +-1.9248476 the job -0.07613316 +-1.5016552 any job -0.048957378 +-2.549091 of job -0.048957378 +-2.9251566 to job -0.048957378 +-2.6717951 their job -0.048957378 +-2.1752765 time job -0.26071677 +-1.670574 available job -0.048957378 +-2.2004402 one job -0.048957378 +-1.4486567 's job -0.048957378 +-1.8783722 career job -0.048957378 +-1.3744649 Part-time job -0.048957378 +-2.2617576 part job -0.048957378 +-0.69060683 selecting job 
+-1.6151336 same job -0.048957378
+-1.4500074 first job -0.04895735
+-0.7414061 it is -0.33447385
+-0.8074056 of the -0.11205632
+-0.7087166 do not -0.131905
+-0.1062074 amount of -0.12183642

[hunk abridged: the patch adds several thousand further bigram entries to this
test language-model data file, all in the same ARPA-style layout shown above,
"<log10 probability> <word1> <word2> <log10 backoff weight>"; the full list is
omitted here for length.]
+-0.65396875 devotion to -0.048957378 +-0.65396875 disciplines to -0.048957378 +-0.65396875 dependence to -0.048957378 +-0.5586906 begins to -0.04895735 +-0.65396875 whom to -0.048957378 +-0.91795576 women to -0.048957378 +-0.91795576 applied to -0.048957378 +-0.91795576 comes to -0.048957378 +-0.91795576 nothing to -0.048957378 +-0.65396875 traveling to -0.048957378 +-0.17017452 due to -0.048957378 +-0.65396875 shown to -0.048957378 +-0.65396875 similarities to -0.048957378 +-0.65396875 apt to -0.048957378 +-0.65396875 supplementary to -0.048957378 +-0.65396875 hopes to -0.048957378 +-0.39307514 impossible to -0.048957378 +-0.65396875 applies to -0.048957378 +-0.65396875 introduced to -0.048957378 +-0.91795576 offered to -0.048957378 +-0.24584402 exposed to -0.04895735 +-1.0599167 40 to -0.048957378 +-0.65396875 thrown to -0.048957378 +-0.91795576 distracting to -0.048957378 +-0.65396875 decides to -0.048957378 +-0.65396875 conducive to -0.048957378 +-1.0756072 down to -0.048957378 +-0.65396875 attracted to -0.048957378 +-0.65396875 continues to -0.048957378 +-0.65396875 access to -0.048957378 +-1.5600798 restaurants to -0.048957378 +-1.0599167 man to -0.048957378 +-0.91795576 attention to -0.048957378 +-0.65396875 asked to -0.048957378 +-0.65396875 pertain to -0.048957378 +-0.65396875 admitted to -0.048957378 +-0.65396875 commuting to -0.048957378 +-0.39307514 opposed to -0.048957378 +-0.65396875 safe-guarded to -0.048957378 +-0.65396875 hurry to -0.048957378 +-0.65396875 Peter to -0.048957378 +-1.1532133 approach to -0.048957378 +-0.91795576 aside to -0.048957378 +-0.65396875 advisable to -0.048957378 +-1.1532133 rights to -0.048957378 +-0.65396875 compelled to -0.048957378 +-0.65396875 harm to -0.048957378 +-0.91795576 justification to -0.048957378 +-0.65396875 campaigns to -0.048957378 +-0.65396875 prone to -0.048957378 +-0.39307514 exposure to -0.048957378 +-0.65396875 close to -0.048957378 +-0.65396875 violently to -0.048957378 +-0.65396875 insult to -0.048957378 +-0.39307514 forced to -0.04895735 +-1.0424002 with their -0.04895735 +-1.3507061 for their -0.071063936 +-2.3389277 that their -0.048957378 +-2.1684217 is their -0.048957378 +-2.285103 , their -0.04895735 +-1.0393417 by their -0.09038658 +-1.0680778 in their -0.11297751 +-1.813698 and their -0.09038658 +-1.2063874 of their -0.19241637 +-1.52725 to their -0.11297751 +-1.9649653 studying their -0.048957378 +-1.0653949 from their -0.0796529 +-2.2431486 as their -0.048957378 +-2.3952057 students their -0.048957378 +-0.84646785 on their -0.38513482 +-2.0188966 also their -0.048957378 +-2.2643657 work their -0.048957378 +-2.0516112 if their -0.048957378 +-1.3922343 pursuing their -0.048957378 +-1.730854 pay their -0.048957378 +-1.7700547 have their -0.04895735 +-0.6799947 select their -0.048957378 +-1.7998925 through their -0.048957378 +-1.8990313 into their -0.048957378 +-1.4551702 spend their -0.048957378 +-1.4933755 put their -0.048957378 +-1.6634643 when their -0.048957378 +-2.0565314 do their -0.048957378 +-1.578171 use their -0.13618872 +-1.6084471 know their -0.048957378 +-1.9900225 what their -0.048957378 +-1.9810973 after their -0.048957378 +-1.3254592 encourage their -0.048957378 +-1.2977935 all their -0.048957378 +-1.8350288 If their -0.048957378 +-1.6030769 get their -0.048957378 +-1.8157326 how their -0.048957378 +-1.5090977 entering their -0.048957378 +-0.6799947 neglecting their -0.048957378 +-1.8217303 make their -0.048957378 +-0.4696877 managing their -0.11268411 +-0.83904034 during their -0.11268411 +-1.5043861 
earning their -0.048957378 +-1.6353966 enjoy their -0.048957378 +-1.2413204 balancing their -0.048957378 +-1.3254592 improve their -0.048957378 +-1.8789842 out their -0.048957378 +-1.806351 about their -0.048957378 +-1.4471194 either their -0.048957378 +-1.2861439 support their -0.048957378 +-1.1295397 keep their -0.048957378 +-1.1295397 building their -0.048957378 +-0.6799947 divide their -0.048957378 +-0.9311309 let their -0.048957378 +-1.4471194 quit their -0.048957378 +-0.96701777 sacrifice their -0.048957378 +-1.6240298 before their -0.048957378 +-1.4988459 enter their -0.048957378 +-0.96701777 hinder their -0.048957378 +-1.4471194 until their -0.048957378 +-0.6799947 maintaining their -0.048957378 +-1.7203138 getting their -0.048957378 +-0.9311309 change their -0.048957378 +-0.6799947 furthering their -0.048957378 +-1.4988459 between their -0.048957378 +-0.6799947 dip their -0.048957378 +-1.4632802 appreciate their -0.048957378 +-0.9311309 waste their -0.187397 +-1.408859 reduce their -0.048957378 +-1.1295397 reach their -0.048957378 +-1.1295397 sure their -0.048957378 +-1.2413204 build their -0.048957378 +-0.96701777 securing their -0.048957378 +-1.1295397 ask their -0.048957378 +-1.1295397 planning their -0.048957378 +-0.96701777 damage their -0.048957378 +-0.6799947 augment their -0.048957378 +-0.96701777 finish their -0.048957378 +-1.4471194 lose their -0.048957378 +-1.5677247 balance their -0.048957378 +-1.4038733 assist their -0.048957378 +-0.9838272 complete their -0.048957378 +-0.9838272 develop their -0.187397 +-1.2413204 pass their -0.048957378 +-1.2413204 towards their -0.048957378 +-1.1295397 receive their -0.048957378 +-0.6799947 raise their -0.048957378 +-1.2413204 given their -0.048957378 +-1.2413204 using their -0.048957378 +-1.3314085 within their -0.048957378 +-0.6799947 mortgaging their -0.048957378 +-0.6799947 shops their -0.048957378 +-1.1295397 leaving their -0.048957378 +-0.6799947 assessing their -0.048957378 +-0.6799947 spread their -0.048957378 +-1.2413204 finally their -0.048957378 +-0.6799947 Throughout their -0.048957378 +-0.6799947 relieve their -0.048957378 +-0.6799947 funding their -0.048957378 +-0.96701777 beginning their -0.048957378 +-0.6799947 sharpen their -0.048957378 +-0.6799947 shape their -0.048957378 +-1.1295397 nonetheless their -0.048957378 +-0.6799947 compromise their -0.048957378 +-0.6799947 regarding their -0.048957378 +-3.3624787 the chosen -0.048957378 +-1.9252394 their chosen -0.048957378 +-2.7835681 have chosen -0.048957378 +-2.5221097 the course -0.04895735 +-2.047408 of course -0.14912641 +-1.5326759 chosen course -0.048957378 +-1.4667772 whether course -0.048957378 +-0.9933765 Of course -0.04895735 +-0.6934062 core course -0.048957378 +-0.9933765 setting course -0.048957378 +-0.9933765 heavy course -0.048957378 +-2.7422526 for study -0.048957378 +-2.8282797 a study -0.048957378 +-1.6573156 full-time study -0.048957378 +-2.980812 , study -0.048957378 +-2.525486 not study -0.048957378 +-2.51364 and study -0.048957378 +-1.8411663 of study -0.2739523 +-1.7749742 to study -0.15352628 +-2.0656435 their study -0.04895735 +-2.7444193 students study -0.048957378 +-1.8437781 less study -0.048957378 +-1.670574 available study -0.048957378 +-2.143244 only study -0.048957378 +-2.0623705 or study -0.048957378 +-1.8682971 academic study -0.048957378 +-2.007869 good study -0.048957378 +-1.9528555 doing study -0.048957378 +-2.331387 them study -0.048957378 +-1.9919091 must study -0.048957378 +-1.1618836 beyond study -0.048957378 +-2.2708902 my 
study -0.048957378 +-1.884701 hard study -0.048957378 +-1.7992023 day study -0.048957378 +-1.5132296 All study -0.048957378 +-0.98780924 effective study -0.048957378 +-0.98780924 reduces study -0.048957378 +-0.69060683 uninteresting study -0.048957378 +-0.98780924 depth study -0.048957378 +-1.5273188 For -0.33716017 +-3.0382125 for example -0.048957378 +-1.7707571 For example -0.75667316 +-2.5749965 with studying -0.048957378 +-2.8046703 for studying -0.187397 +-2.4597433 is studying -0.187397 +-2.5009751 student studying -0.048957378 +-2.78825 , studying -0.048957378 +-2.328257 by studying -0.048957378 +-2.869348 in studying -0.048957378 +-2.5660546 and studying -0.048957378 +-2.589116 of studying -0.048957378 +-2.1177297 on studying -0.04895735 +-1.8882614 time studying -0.04895735 +-2.252633 are studying -0.048957378 +-2.0566664 spend studying -0.048957378 +-1.4786097 while studying -0.13618872 +-2.0345504 because studying -0.048957378 +-1.1424879 were studying -0.048957378 +-1.5750064 someone studying -0.048957378 +-1.8892066 those studying -0.048957378 +-1.9090905 years studying -0.048957378 +-1.6259912 spent studying -0.048957378 +-1.1629949 finished studying -0.048957378 +-3.2102208 and engineering -0.048957378 +-2.201754 studying engineering -0.048957378 +-2.453272 an engineering -0.048957378 +-0.69396824 mechanical engineering -0.048957378 +-2.5802724 it has -0.048957378 +-2.0747159 that has -0.04895735 +-2.570245 job has -0.048957378 +-2.475432 student has -0.048957378 +-2.759342 , has -0.04895735 +-2.7795007 and has -0.048957378 +-1.2829179 engineering has -0.048957378 +-1.9277668 reason has -0.048957378 +-1.6864614 one has -0.048957378 +-2.6094086 work has -0.048957378 +-1.161342 market has -0.048957378 +-1.6849463 who has -0.048957378 +-2.141174 which has -0.048957378 +-2.1820824 education has -0.048957378 +-2.0815635 world has -0.048957378 +-1.5707507 someone has -0.048957378 +-1.7713375 workplace has -0.048957378 +-1.7355181 individual has -0.048957378 +-1.7397609 he has -0.048957378 +-0.98863983 teacher has -0.048957378 +-0.98863983 Poker has -0.048957378 +-1.161342 poker has -0.048957378 +-0.69102556 trend has -0.048957378 +-1.9980528 smoke has -0.048957378 +-0.69102556 air has -0.048957378 +-1.9470313 be very -0.04895735 +-2.615849 a very -0.04895735 +-1.8774816 is very -0.19131482 +-2.6890378 the very -0.048957378 +-2.5951056 not very -0.048957378 +-3.0465307 to very -0.048957378 +-2.1352682 has very -0.048957378 +-2.650844 work very -0.048957378 +-2.4735553 should very -0.048957378 +-2.6454384 college very -0.048957378 +-2.034786 become very -0.048957378 +-0.9903057 pays very -0.048957378 +-1.7829504 are very -0.11268411 +-2.2979062 fs very -0.048957378 +-1.7080401 still very -0.048957378 +-1.6702625 worked very -0.048957378 +-1.833048 was very -0.048957378 +-1.2862043 its very -0.048957378 +-0.6918643 weighed very -0.048957378 +-1.817344 with little -0.04895735 +-2.095488 having little -0.048957378 +-1.9739912 a little -0.12183642 +-3.126527 of little -0.048957378 +-1.5786462 very little -0.048957378 +-2.686682 as little -0.048957378 +-2.2040818 have little -0.187397 +-2.2654846 what little -0.048957378 +-2.3150349 that experience -0.048957378 +-2.4861102 job experience -0.048957378 +-0.92697257 valuable experience -0.11268411 +-2.2293139 the experience -0.099168696 +-2.4641187 will experience -0.048957378 +-1.2758812 acquire experience -0.048957378 +-2.4482787 and experience -0.048957378 +-2.7416196 of experience -0.048957378 +-2.284967 to experience -0.048957378 
+-2.4964895 as experience -0.048957378 +-2.6798315 students experience -0.048957378 +-1.5572013 Such experience -0.048957378 +-2.5046253 on experience -0.048957378 +-2.2338045 also experience -0.048957378 +-1.2813437 work experience -0.14843592 +-2.2583182 an experience -0.048957378 +-2.2564173 may experience -0.048957378 +-2.1128004 college experience -0.048957378 +-2.5140097 have experience -0.048957378 +-2.156075 some experience -0.048957378 +-1.9588021 first experience -0.048957378 +-2.0042908 This experience -0.048957378 +-2.2932804 them experience -0.048957378 +-1.0519074 useful experience -0.048957378 +-1.156013 relevant experience -0.048957378 +-1.4811971 life experience -0.13618872 +-0.9850521 providing experience -0.048957378 +-0.7058281 actual experience -0.048957378 +-2.002377 learning experience -0.048957378 +-2.2295964 my experience -0.048957378 +-0.68921393 hands-on experience -0.048957378 +-1.5636735 hand experience -0.048957378 +-0.9850521 direct experience -0.048957378 +-0.68921393 worthwhile experience -0.048957378 +-1.3677793 invaluable experience -0.048957378 +-0.68921393 real-world experience -0.048957378 +-0.68921393 Work experience -0.048957378 +-0.68921393 flipping experience -0.048957378 +-2.1210215 can gain -0.048957378 +-2.7056403 and gain -0.04895735 +-2.2480474 to gain -0.09038658 +-2.933356 students gain -0.048957378 +-2.392312 also gain -0.048957378 +-2.0019288 financial gain -0.048957378 +-2.2952857 student from -0.048957378 +-2.4617667 and from -0.048957378 +-1.1984419 gain from -0.048957378 +-2.2938082 working from -0.048957378 +-2.5599706 students from -0.048957378 +-2.1040056 time from -0.048957378 +-1.9959579 but from -0.048957378 +-2.04196 only from -0.048957378 +-1.892135 money from -0.048957378 +-1.0453532 benefit from -0.048957378 +-0.686027 Apart from -0.048957378 +-1.1467421 gained from -0.048957378 +-1.7113171 friends from -0.048957378 +-1.8060203 people from -0.048957378 +-0.4062422 borrowed from -0.048957378 +-2.2165635 them from -0.048957378 +-1.5849212 useful from -0.048957378 +-1.1467421 Learning from -0.048957378 +-1.6411357 get from -0.048957378 +-0.17476808 away from -0.08422062 +-1.2637045 lot from -0.048957378 +-1.5376908 graduate from -0.048957378 +-0.686027 acquired from -0.048957378 +-0.5505629 transition from -0.13618872 +-1.4665623 learning from -0.048957378 +-0.87467194 different from -0.048957378 +-1.8146441 support from -0.048957378 +-1.4278648 free from -0.048957378 +-1.8271488 those from -0.048957378 +-1.4973841 come from -0.048957378 +-0.686027 exclusively from -0.048957378 +-0.686027 scholarships from -0.048957378 +-0.97877645 adapt from -0.048957378 +-0.686027 leap from -0.048957378 +-1.5780652 fully from -0.048957378 +-0.4062422 escape from -0.048957378 +-0.686027 Wages from -0.048957378 +-1.8989673 things from -0.048957378 +-0.97877645 suffering from -0.048957378 +-0.4062422 refrain from -0.048957378 +-1.7127738 home from -0.048957378 +-0.686027 ranging from -0.048957378 +-1.5814797 independence from -0.048957378 +-0.97877645 view from -0.048957378 +-0.686027 hide from -0.048957378 +-0.97877645 break from -0.048957378 +-0.97877645 direct from -0.048957378 +-1.4937676 save from -0.048957378 +-0.97877645 cities from -0.048957378 +-0.97877645 profit from -0.048957378 +-0.9394727 banned from -0.048957378 +-1.1467421 graduating from -0.048957378 +-0.686027 Incomes from -0.048957378 +-0.686027 comprehend from -0.048957378 +-0.686027 vary from -0.048957378 +-0.686027 contact from -0.048957378 +-0.686027 lacking from 
-0.048957378 +-0.686027 evolved from -0.048957378 +-0.97877645 distract from -0.048957378 +-0.686027 fatigued from -0.048957378 +-0.686027 occur from -0.048957378 +-2.4315228 with working -0.048957378 +-2.3118384 for working -0.048957378 +-1.271607 While working -0.048957378 +-1.8638816 that working -0.048957378 +-2.494195 the working -0.048957378 +-1.9875681 , working -0.11268411 +-2.4128027 not working -0.048957378 +-1.4808807 by working -0.04895735 +-2.092994 and working -0.048957378 +-2.253649 of working -0.04895735 +-2.531885 their working -0.187397 +-1.7936449 course working -0.048957378 +-1.8134881 experience working -0.048957378 +-1.2292035 from working -0.09038658 +-2.4572465 as working -0.048957378 +-2.2848065 students working -0.04895735 +-1.6749804 time working -0.11268411 +-2.085097 only working -0.048957378 +-1.758991 And working -0.048957378 +-1.499587 does working -0.048957378 +-2.4525099 are working -0.048957378 +-1.3625039 Part-time working -0.048957378 +-0.9993211 By working -0.13618872 +-2.2272341 skills working -0.048957378 +-1.7907305 like working -0.048957378 +-1.4665349 while working -0.048957378 +-1.8933928 believe working -0.048957378 +-2.1733258 fs working -0.048957378 +-2.0172663 up working -0.048957378 +-0.982859 continue working -0.048957378 +-1.4968498 either working -0.048957378 +-1.6478957 start working -0.048957378 +-1.9337372 than working -0.048957378 +-1.8603084 hard working -0.048957378 +-1.7028046 always working -0.048957378 +-1.634657 Students working -0.048957378 +-1.6372883 week working -0.048957378 +-1.5490685 both working -0.048957378 +-1.5567847 hand working -0.048957378 +-0.6881027 unhappiness working -0.048957378 +-0.6881027 excessive working -0.048957378 +-1.271607 ever working -0.048957378 +-1.1527659 currently working -0.048957378 +-0.6881027 tedious working -0.048957378 +-0.6881027 anybody working -0.048957378 +-0.982859 smoke-free working -0.048957378 +-2.30506 part-time as -0.048957378 +-2.0112605 job as -0.048957378 +-2.4123163 is as -0.048957378 +-2.22024 student as -0.048957378 +-1.7778959 , as -0.20951243 +-1.8620291 often as -0.048957378 +-1.8715571 employment as -0.048957378 +-2.3517635 and as -0.048957378 +-1.3448838 nature as -0.048957378 +-2.085475 study as -0.048957378 +-1.7733369 experience as -0.048957378 +-2.2311406 working as -0.048957378 +-2.4772856 students as -0.048957378 +-1.5151759 Such as -0.048957378 +-2.0158684 studies as -0.048957378 +-1.813319 jobs as -0.048957378 +-1.7292166 focus as -0.048957378 +-1.8940479 -LRB- as -0.048957378 +-0.5155745 such as -0.048957378 +-2.3186307 have as -0.048957378 +-1.8687097 money as -0.048957378 +-1.7052932 And as -0.048957378 +-1.9511064 you as -0.048957378 +-1.9315999 or as -0.048957378 +-1.8688633 well as -0.0796529 +-1.5196023 benefits as -0.048957378 +-2.1360562 skills as -0.048957378 +-2.145555 important as -0.048957378 +-1.8195623 themselves as -0.048957378 +-1.7270309 environment as -0.048957378 +-1.5278686 position as -0.048957378 +-1.5657564 financially as -0.048957378 +-1.9315103 spend as -0.048957378 +-1.3448838 times as -0.048957378 +-1.6925508 friends as -0.048957378 +-1.5151759 put as -0.048957378 +-2.1341243 people as -0.048957378 +-2.1593525 them as -0.048957378 +-0.57832295 distractions as -0.048957378 +-1.8704551 society as -0.048957378 +-1.8640056 activities as -0.048957378 +-1.8155737 university as -0.048957378 +-1.5657564 finances as -0.048957378 +-1.2533867 pressures as -0.048957378 +-0.6832746 generated as -0.048957378 +-1.7913175 living as -0.048957378 
+-1.1388383 almost as -0.048957378 +-0.97339207 require as -0.048957378 +-1.5570483 force as -0.048957378 +-1.6022459 success as -0.048957378 +-1.3401492 interests as -0.048957378 +-2.0178359 learn as -0.048957378 +-1.4796438 beneficial as -0.048957378 +-1.5613806 just as -0.048957378 +-1.4843063 come as -0.048957378 +-1.9127479 much as -0.048957378 +-1.3401492 system as -0.048957378 +-0.6832746 manageable as -0.048957378 +-1.4711914 Working as -0.048957378 +-1.4094145 far as -0.048957378 +-1.8661804 things as -0.048957378 +-1.2533867 serve as -0.048957378 +-1.6929319 days as -0.048957378 +-1.1388383 soon as -0.048957378 +-1.1388383 -- as -0.048957378 +-0.6832746 serves as -0.048957378 +-0.97339207 socially as -0.048957378 +-1.3448838 together as -0.048957378 +-1.1388383 expensive as -0.048957378 +-1.1388383 off as -0.048957378 +-1.1388383 graduating as -0.048957378 +-0.97339207 seen as -0.048957378 +-1.2533867 finally as -0.048957378 +-0.6832746 mocked as -0.048957378 +-0.97339207 hobbies as -0.048957378 +-1.7874773 health as -0.048957378 +-0.6832746 decisions as -0.048957378 +-0.6832746 open as -0.048957378 +-0.97339207 nightclubs as -0.048957378 +-0.6832746 else as -0.048957378 +-1.3448838 smokers as -0.048957378 +-3.3047535 a waiter -0.048957378 +-2.4042459 a restaurant -0.1647075 +-2.5283554 the restaurant -0.04895735 +-3.1476786 of restaurant -0.048957378 +-2.6941473 on restaurant -0.048957378 +-1.9132066 those restaurant -0.048957378 +-0.69354665 fast-food restaurant -0.048957378 +-0.69354665 Exposing restaurant -0.048957378 +-2.7708678 Moreover -0.27955192 +-1.2862043 While there -0.048957378 +-2.8053942 that there -0.048957378 +-1.91234 , there -0.25144583 +-2.5822704 and there -0.048957378 +-2.6069639 as there -0.048957378 +-2.1780512 work there -0.048957378 +-2.048471 then there -0.048957378 +-1.5688267 but there -0.048957378 +-1.1638237 However there -0.048957378 +-2.275442 when there -0.048957378 +-2.0463448 If there -0.048957378 +-1.1638237 Firstly there -0.048957378 +-1.5228355 feel there -0.048957378 +-2.0020888 than there -0.048957378 +-1.3805711 though there -0.048957378 +-1.2862043 Perhaps there -0.048957378 +-1.8917471 think there -0.048957378 +-0.6918643 driving there -0.048957378 +-0.6918643 served there -0.048957378 +-2.106229 having ample -0.048957378 +-3.120598 is ample -0.048957378 +-1.855274 this reason -0.11268411 +-3.042853 the reason -0.048957378 +-0.99169886 ample reason -0.048957378 +-2.1945517 only reason -0.048957378 +-1.2889621 Another reason -0.187397 +-2.3340702 important reason -0.048957378 +-1.4602048 first reason -0.50140065 +-2.048765 good reason -0.048957378 +-1.9400582 other reason -0.048957378 +-1.6335956 main reason -0.048957378 +-1.3554904 no reason -0.187397 +-1.5288866 second reason -0.048957378 +-1.1659027 significant reason -0.048957378 +-1.1659027 obvious reason -0.048957378 +-1.4019599 for students -0.36923593 +-1.3307923 that students -0.100717194 +-2.3124604 a students -0.048957378 +-1.756902 the students -0.16328607 +-1.5387614 , students -0.12614639 +-1.2284902 many students -0.09038658 +-2.232715 and students -0.048957378 +-2.03171 of students -0.048957378 +-2.2857842 to students -0.048957378 +-2.20754 from students -0.048957378 +-2.222474 working students -0.048957378 +-2.307907 as students -0.048957378 +-1.8347775 reason students -0.048957378 +-2.321427 on students -0.048957378 +-2.0178638 The students -0.048957378 +-2.347648 time students -0.048957378 +-2.0649054 school students -0.048957378 +-1.6545906 if students -0.048957378 
+-2.030902 such students -0.048957378 +-1.973532 -RRB- students -0.048957378 +-0.61004657 college students -0.42267904 +-1.0082663 some students -0.09038658 +-1.0149802 most students -0.09038658 +-1.3382854 Most students -0.048957378 +-1.5629534 benefit students -0.048957378 +-1.2962623 ; students -0.048957378 +-2.070681 more students -0.048957378 +-0.99064183 helps students -0.187397 +-2.230422 at students -0.048957378 +-1.1539278 Many students -0.048957378 +-0.6715578 gives students -0.04895735 +-1.0388963 College students -0.15838256 +-0.96314585 help students -0.048957378 +-2.1083074 when students -0.048957378 +-2.0365932 what students -0.048957378 +-1.3382854 encourage students -0.048957378 +-1.3067409 all students -0.20946135 +-1.1615238 Japanese students -0.048957378 +-1.5123903 therefore students -0.048957378 +-1.3183289 provide students -0.187397 +-1.8779378 If students -0.048957378 +-1.5123903 teach students -0.048957378 +-1.0831242 give students -0.04895735 +-0.37547976 fellow students -0.13618872 +-1.8364648 believe students -0.048957378 +-0.6828633 recently-graduated students -0.048957378 +-0.8165399 university students -0.11268411 +-1.7050076 A students -0.048957378 +-2.0056694 which students -0.048957378 +-1.9798762 these students -0.048957378 +-1.137665 allowing students -0.048957378 +-0.6828633 encourages students -0.048957378 +-1.4120058 provides students -0.048957378 +-1.0780463 where students -0.04895735 +-0.4049682 expose students -0.048957378 +-1.2518599 requires students -0.048957378 +-1.4688417 feel students -0.187397 +-0.4735685 allows students -0.11268411 +-1.4641747 So students -0.048957378 +-1.8527665 than students -0.048957378 +-1.4688417 Some students -0.048957378 +-1.5713763 possible students -0.048957378 +-1.8039696 years students -0.048957378 +-1.4072294 poor students -0.048957378 +-1.3382854 allow students -0.048957378 +-0.9771071 young students -0.048957378 +-1.137665 enables students -0.048957378 +-1.3523244 teaches students -0.048957378 +-1.721344 want students -0.048957378 +-1.5584444 why students -0.048957378 +-0.6828633 8,000 students -0.048957378 +-1.4072294 few students -0.048957378 +-1.2518599 current students -0.048957378 +-0.8715577 assist students -0.048957378 +-0.9725902 Otherwise students -0.048957378 +-1.137665 busy students -0.048957378 +-0.9725902 unproductive students -0.048957378 +-1.137665 off students -0.048957378 +-0.9725902 brings students -0.048957378 +-0.6828633 irresponsible students -0.048957378 +-0.6828633 pushing students -0.048957378 +-0.6828633 needy students -0.048957378 +-0.6828633 inspire students -0.048957378 +-0.9725902 spoiled students -0.048957378 +-2.7556913 not engage -0.048957378 +-2.90557 to engage -0.187397 +-1.9188446 They engage -0.048957378 +-2.0134587 Such -0.12570384 +-2.0510054 often mean -0.048957378 +-2.4158804 also mean -0.048957378 +-2.7187858 with considerable -0.048957378 +-3.2649243 a considerable -0.048957378 +-1.5948578 meet considerable -0.048957378 +-2.8272688 the amount -0.187397 +-1.16925 considerable amount -0.048957378 +-1.2934115 small amount -0.187397 +-1.16925 significant amount -0.048957378 +-1.5913469 large amount -0.048957378 +-0.69368714 budgeted amount -0.048957378 +-2.6128922 student additional -0.048957378 +-3.1699133 of additional -0.048957378 +-2.7018807 on additional -0.048957378 +-2.4390335 an additional -0.048957378 +-2.7526727 have additional -0.048957378 +-1.2934115 requires additional -0.048957378 +-1.4689857 additional stress -0.048957378 +-2.3479426 The stress -0.048957378 
+-1.16925 cause stress -0.048957378 +-1.4679527 added stress -0.048957378 +-1.16925 causes stress -0.048957378 +-0.9939372 creates stress -0.048957378 +-2.0174937 it on -0.04895735 +-2.253122 be on -0.048957378 +-2.0092933 job on -0.04895735 +-2.2296135 , on -0.20946135 +-2.2493231 not on -0.048957378 +-2.3469317 and on -0.048957378 +-1.9975538 has on -0.048957378 +-1.413362 stress on -0.048957378 +-1.7530437 less on -0.048957378 +-2.3546855 time on -0.048957378 +-0.7099427 focus on -0.28567907 +-0.9731246 effectively on -0.048957378 +-2.0464685 work on -0.048957378 +-1.8919449 -LRB- on -0.048957378 +-2.2467754 money on -0.048957378 +-2.036208 so on -0.048957378 +-0.9731246 guarantee on -0.048957378 +-2.1857195 or on -0.048957378 +-1.9111866 even on -0.048957378 +-2.2686563 are on -0.048957378 +-2.0765378 more on -0.048957378 +-0.5434636 taking on -0.048957378 +-1.9297018 spend on -0.048957378 +-0.6831375 strains on -0.048957378 +-1.408685 effort on -0.048957378 +-1.6137459 something on -0.048957378 +-1.8110754 take on -0.048957378 +-1.7904631 classes on -0.048957378 +-1.8622386 activities on -0.048957378 +-1.1384469 strongly on -0.048957378 +-1.9392554 up on -0.048957378 +-0.9731246 depends on -0.048957378 +-1.8099428 going on -0.048957378 +-1.917756 out on -0.048957378 +-0.54891545 impact on -0.13618872 +-0.9731246 determination on -0.048957378 +-0.2700946 based on -0.048957378 +-1.5730753 possible on -0.048957378 +-0.40507883 relying on -0.187397 +-1.596889 worked on -0.048957378 +-1.344314 back on -0.048957378 +-0.7942637 concentrate on -0.048957378 +-0.6831375 role on -0.048957378 +-1.1434696 participation on -0.048957378 +-0.8705654 effect on -0.048957378 +-1.413362 takes on -0.187397 +-1.794955 think on -0.048957378 +-0.47110486 focused on -0.04895735 +-0.57823205 moving on -0.048957378 +-0.9731246 dependent on -0.048957378 +-0.9731246 completed on -0.048957378 +-1.2528772 build on -0.048957378 +-1.344314 early on -0.048957378 +-0.9731246 % on -0.048957378 +-0.6831375 cheaper on -0.048957378 +-0.6831375 strictly on -0.048957378 +-0.7002525 concentration on -0.048957378 +-0.2520104 rely on -0.04895735 +-0.2520104 concentrating on -0.27955192 +-0.9731246 influence on -0.048957378 +-0.6831375 Depending on -0.048957378 +-0.6831375 priority on -0.048957378 +-0.6831375 action on -0.048957378 +-1.339527 invaluable on -0.048957378 +-1.039454 spent on -0.048957378 +-1.344314 public on -0.048957378 +-1.2528772 purely on -0.048957378 +-0.6831375 Or on -0.048957378 +-1.1384469 expense on -0.048957378 +-0.6831375 strain on -0.048957378 +-0.9731246 pressure on -0.187397 +-1.1384469 difficulties on -0.048957378 +-0.40507883 hands on -0.187397 +-0.6831375 Based on -0.187397 +-1.2528772 groups on -0.048957378 +-0.6831375 lounging on -0.048957378 +-0.6831375 depend on -0.048957378 +-0.6831375 Queens on -0.048957378 +-0.6831375 influences on -0.048957378 +-0.3484125 ban on -0.1722646 +-0.40507883 restrictions on -0.048957378 +-0.6831375 severe on -0.048957378 +-3.4000094 the top -0.048957378 +-2.1578155 on top -0.187397 +-3.0178597 that created -0.048957378 +-2.8775895 for studies -0.048957378 +-2.7371416 the studies -0.048957378 +-3.01755 of studies -0.048957378 +-1.3222532 their studies -0.24477966 +-1.9955268 from studies -0.048957378 +-2.1321971 on studies -0.048957378 +-2.016018 's studies -0.13618872 +-1.8961413 academic studies -0.048957378 +-1.8040662 my studies -0.048957378 +-1.384001 University studies -0.048957378 +-1.939229 our studies -0.048957378 +-1.4599779 his studies -0.048957378 
+-1.3848785 daily studies -0.048957378 +-1.3848785 tertiary studies -0.048957378 +-1.2027681 The -0.12879533 +-1.8006662 I also -0.04895735 +-2.5483181 it also -0.048957378 +-1.1931207 can also -0.07613316 +-2.421837 part-time also -0.048957378 +-2.1256256 job also -0.04895735 +-1.687835 is also -0.14477092 +-1.7873261 will also -0.13618872 +-2.9581296 , also -0.048957378 +-2.7188857 and also -0.048957378 +-2.2683425 also also -0.048957378 +-2.5948405 time also -0.048957378 +-2.3574555 jobs also -0.048957378 +-2.2836058 may also -0.048957378 +-1.0426338 but also -0.04895735 +-1.9413786 should also -0.187397 +-2.6309447 they also -0.048957378 +-1.9729319 's also -0.048957378 +-1.8676002 ; also -0.048957378 +-2.3226044 people also -0.048957378 +-1.8593925 They also -0.048957378 +-1.2801979 fre also -0.048957378 +-2.0210266 while also -0.048957378 +-1.2801979 fll also -0.048957378 +-1.5740396 could also -0.048957378 +-1.9875935 must also -0.048957378 +-1.8366562 It also -0.048957378 +-2.2427747 fs also -0.048957378 +-1.7291627 would also -0.048957378 +-1.6547388 Students also -0.048957378 +-1.5113788 us also -0.048957378 +-2.3119407 be increased -0.048957378 +-3.0543673 a less -0.048957378 +-2.506663 is less -0.048957378 +-2.6443284 and less -0.04895735 +-3.1719224 to less -0.048957378 +-2.6559741 as less -0.048957378 +-1.822755 focus less -0.048957378 +-2.1929817 have less -0.048957378 +-1.988769 doing less -0.048957378 +-1.4629681 usually less -0.048957378 +-2.088065 much less -0.048957378 +-1.4629681 far less -0.048957378 +-2.027863 smoke less -0.048957378 +-2.1932259 this time -0.048957378 +-2.2431653 a time -0.1722646 +-1.7900798 the time -0.106249355 +-2.1072464 , time -0.09038658 +-1.8870753 of time -0.07613316 +-2.693594 to time -0.048957378 +-1.441317 their time -0.04895735 +-1.0931581 study time -0.099168696 +-0.9806769 ample time -0.048957378 +-1.7774608 on time -0.13618872 +-1.2841984 less time -0.048957378 +-0.7714597 available time -0.04895735 +-2.4307246 have time -0.187397 +-0.9148412 extra time -0.04895735 +-2.1108239 some time -0.048957378 +-1.2673745 - time -0.048957378 +-2.3086903 or time -0.048957378 +-0.22725546 part time -0.74411327 +-1.1486357 more time -0.099168696 +-1.7469848 important time -0.187397 +-1.5856681 same time -0.23812707 +-1.9231375 first time -0.04895735 +-1.379641 enough time -0.187397 +-0.73934513 spend time -0.0796529 +-1.9613218 This time -0.048957378 +-1.7547461 your time -0.048957378 +-1.8602109 take time -0.048957378 +-1.7376593 fs time -0.11268411 +-1.3317025 no time -0.048957378 +-1.4897636 quite time -0.048957378 +-1.78098 find time -0.048957378 +-0.32378548 full time -0.048957378 +-0.70379233 limited time -0.048957378 +-1.2673745 requires time -0.048957378 +-0.8760494 free time -0.048957378 +-1.3572918 manage time -0.048957378 +-1.9732132 much time -0.048957378 +-2.170744 my time -0.048957378 +-1.2673745 short time -0.048957378 +-0.8760494 takes time -0.048957378 +-0.68699443 Part time -0.20946135 +-1.4961216 waste time -0.048957378 +-1.1495428 easier time -0.048957378 +-1.92054 his time -0.048957378 +-0.68699443 arranging time -0.048957378 +-0.9806769 excellent time -0.048957378 +-1.2673745 using time -0.048957378 +-1.2673745 over time -0.048957378 +-0.68699443 special time -0.048957378 +-0.40663064 transitional time -0.048957378 +-0.68699443 opportune time -0.048957378 +-0.68699443 in-class time -0.048957378 +-0.9806769 reasonable time -0.048957378 +-0.68699443 His time -0.048957378 +-2.9718654 is available -0.048957378 +-3.1340673 the 
available -0.048957378 +-2.080413 any available -0.048957378 +-2.8794727 their available -0.048957378 +-1.8775834 little available -0.048957378 +-2.2197218 time available -0.187397 +-2.4608476 jobs available -0.048957378 +-2.3679307 more available -0.048957378 +-2.3488793 fs available -0.048957378 +-1.1675731 Jobs available -0.048957378 +-2.596728 to accomplish -0.04895735 +-3.09512 Additionally -0.187397 +-0.5181642 part-time jobs -0.23893401 +-1.6560253 full-time jobs -0.048957378 +-2.2445977 many jobs -0.048957378 +-2.0278883 any jobs -0.048957378 +-2.7948785 in jobs -0.048957378 +-2.5436628 of jobs -0.048957378 +-2.063378 their jobs -0.04895735 +-2.2222443 The jobs -0.048957378 +-1.6877759 time jobs -0.24943376 +-1.669731 available jobs -0.048957378 +-1.5665365 f jobs -0.048957378 +-2.1343203 such jobs -0.048957378 +-2.069751 most jobs -0.048957378 +-2.5768034 are jobs -0.048957378 +-1.3737917 Part-time jobs -0.099168696 +-2.2605467 part jobs -0.048957378 +-1.9802302 first jobs -0.048957378 +-1.9060905 other jobs -0.048957378 +-2.1893034 what jobs -0.048957378 +-2.316782 all jobs -0.048957378 +-1.7624514 real jobs -0.048957378 +-2.1844375 get jobs -0.048957378 +-1.5682971 graduate jobs -0.048957378 +-1.3737917 basic jobs -0.048957378 +-0.69046736 Student jobs -0.048957378 +-0.69046736 Real-world jobs -0.048957378 +-1.1596954 regular jobs -0.048957378 +-0.69046736 offering jobs -0.048957378 +-0.69046736 low-skilled jobs -0.048957378 +-2.6886988 can leave -0.048957378 +-2.8944666 to leave -0.048957378 +-2.831353 they leave -0.048957378 +-1.7066134 we leave -0.048957378 +-2.4945273 with one -0.048957378 +-2.0841045 for one -0.13618872 +-2.5183668 be one -0.048957378 +-2.679537 that one -0.048957378 +-2.050633 having one -0.048957378 +-2.4114535 part-time one -0.048957378 +-2.3998318 is one -0.048957378 +-2.7534437 the one -0.048957378 +-2.4952023 , one -0.04895735 +-2.7540843 in one -0.048957378 +-2.1237957 of one -0.187397 +-2.8597465 to one -0.048957378 +-2.5175261 as one -0.048957378 +-2.5249085 on one -0.048957378 +-2.2507303 also one -0.048957378 +-1.4451655 leave one -0.048957378 +-2.2078345 if one -0.048957378 +-1.4451655 pursuing one -0.048957378 +-2.5675447 college one -0.048957378 +-2.4022777 money one -0.048957378 +-1.780973 : one -0.048957378 +-1.5097663 does one -0.048957378 +-2.254753 do one -0.048957378 +-1.8758186 no one -0.048957378 +-1.9663883 make one -0.048957378 +-1.3728908 during one -0.187397 +-1.9494588 where one -0.048957378 +-1.5613256 perhaps one -0.048957378 +-1.9475634 things one -0.048957378 +-0.6897705 cloud one -0.048957378 +-1.1576457 Reason one -0.048957378 +-0.6897705 complement one -0.048957378 +-0.6897705 complements one -0.048957378 +-0.6897705 perfected one -0.048957378 +-3.3971481 , tired -0.048957378 +-2.2791026 one tired -0.048957378 +-1.8616352 being tired -0.048957378 +-2.2770538 so tired -0.048957378 +-0.6938276 physically tired -0.048957378 +-3.2799814 and thus -0.048957378 +-0.6942267 horizons thus -0.048957378 +-1.8347663 cases unable -0.048957378 +-3.2102208 and unable -0.048957378 +-1.1700908 thus unable -0.048957378 +-2.8669279 are unable -0.048957378 +-2.7912447 the focus -0.048957378 +-3.0629127 and focus -0.048957378 +-3.1063578 of focus -0.048957378 +-2.059188 to focus -0.13332285 +-2.8945246 their focus -0.048957378 +-1.6463084 should focus -0.40449065 +-1.5885828 today focus -0.048957378 +-1.4654533 colleges focus -0.048957378 +-2.0975585 much focus -0.048957378 +-1.8378968 focus effectively -0.048957378 +-1.4711709 tasks 
effectively -0.048957378 +-2.5736907 it ? -0.048957378 +-2.5631618 job ? -0.048957378 +-2.821331 in ? -0.048957378 +-2.3400064 experience ? -0.048957378 +-2.336049 students ? -0.187397 +-2.1402833 studies ? -0.048957378 +-2.6202376 time ? -0.048957378 +-2.4346166 money ? -0.048957378 +-2.2894611 skills ? -0.048957378 +-2.1795027 education ? -0.048957378 +-2.078757 world ? -0.048957378 +-0.69088596 subjective ? -0.048957378 +-0.9883628 straightforward ? -0.048957378 +-1.7010977 lives ? -0.048957378 +-1.8239864 Japan ? -0.048957378 +-1.5150883 generally ? -0.048957378 +-1.5166876 anything ? -0.048957378 +-1.2840103 resources ? -0.048957378 +-1.2823725 stage ? -0.048957378 +-0.69088596 authorities ? -0.048957378 +-0.69088596 governments ? -0.048957378 +-0.69088596 competency ? -0.048957378 +-0.9883628 attitude ? -0.048957378 +-1.2823725 weekends ? -0.048957378 +-0.69088596 bursary ? -0.048957378 +-1.2823725 here ? -0.048957378 +-3.6294703 resulting -0.048957378 +-3.4000094 the resulting -0.048957378 +-2.7477093 a lower -0.048957378 +-2.5949488 from lower -0.048957378 +-2.85306 the quality -0.187397 +-1.1700908 lower quality -0.048957378 +-2.08053 good quality -0.048957378 +-1.469627 poor quality -0.048957378 +-2.654057 the school -0.048957378 +-2.7735555 , school -0.048957378 +-1.6903458 in school -0.19052841 +-2.5504224 and school -0.048957378 +-2.892758 of school -0.048957378 +-2.988369 to school -0.048957378 +-2.7202551 their school -0.048957378 +-1.6700217 from school -0.13618872 +-2.5861301 on school -0.048957378 +-2.1608806 only school -0.048957378 +-1.3959985 through school -0.187397 +-1.4094224 at school -0.11268411 +-0.47545445 high school -0.13618872 +-1.5450042 after school -0.187397 +-1.3775741 during school -0.048957378 +-1.2840105 balancing school -0.048957378 +-0.8038377 particular school -0.048957378 +-1.5178914 quit school -0.048957378 +-1.5193118 afford school -0.048957378 +-1.9938333 his school -0.048957378 +-0.691305 nursery school -0.048957378 +-0.691305 cram school -0.048957378 +-0.7077454 attending school -0.048957378 +-2.3252554 I work -0.048957378 +-2.0261102 for work -0.04895735 +-1.998533 can work -0.04895735 +-2.2296102 that work -0.048957378 +-2.3698292 a work -0.048957378 +-0.92244244 part-time work -0.0923415 +-2.4741051 is work -0.048957378 +-1.8940754 valuable work -0.048957378 +-1.774281 the work -0.3101605 +-2.1641345 , work -0.048957378 +-2.0179143 not work -0.048957378 +-2.155303 and work -0.04895735 +-1.9547871 of work -0.09038658 +-1.4441775 to work -0.169769 +-2.4058778 their work -0.048957378 +-1.2534463 course work -0.048957378 +-1.7131832 gain work -0.048957378 +-2.3646681 as work -0.048957378 +-2.5292082 students work -0.048957378 +-1.8347353 time work -0.048957378 +-1.6581231 school work -0.048957378 +-2.058534 such work -0.048957378 +-1.9194632 then work -0.048957378 +-2.2906883 should work -0.048957378 +-2.0232077 only work -0.048957378 +-2.430137 college work -0.048957378 +-2.4433575 they work -0.048957378 +-1.2321298 extra work -0.048957378 +-1.6758809 To work -0.048957378 +-2.242867 or work -0.048957378 +-1.3483186 Part-time work -0.048957378 +-1.8967416 future work -0.048957378 +-1.8595383 through work -0.048957378 +-1.643312 who work -0.187397 +-2.274983 at work -0.048957378 +-1.8742472 doing work -0.048957378 +-1.7066337 real work -0.048957378 +-1.527458 therefore work -0.048957378 +-1.1439594 menial work -0.048957378 +-0.97688437 sufficient work -0.048957378 +-1.1481692 prior work -0.048957378 +-1.7603195 find work -0.048957378 +-1.4776343 
either work -0.048957378 +-0.68506163 strong work -0.048957378 +-1.9408126 much work -0.048957378 +-0.6600929 hard work -0.14912641 +-0.97688437 definitely work -0.048957378 +-1.26419 America work -0.048957378 +-1.4776343 generally work -0.048957378 +-1.7096123 days work -0.04895735 +-1.1439594 gaining work -0.048957378 +-0.68506163 processes work -0.048957378 +-1.2310314 class work -0.048957378 +-0.68506163 Actual work -0.048957378 +-1.1439594 hierarchies work -0.048957378 +-1.5383855 hand work -0.048957378 +-1.8253148 our work -0.048957378 +-2.0705028 we work -0.048957378 +-1.4776343 her work -0.048957378 +-1.26419 respect work -0.048957378 +-1.1439594 Balancing work -0.048957378 +-1.1439594 regular work -0.048957378 +-0.68506163 volunteer work -0.048957378 +-0.68506163 Basic work -0.048957378 +-0.97688437 Parents work -0.048957378 +-0.68506163 semi-regular work -0.048957378 +-0.68506163 firm work -0.048957378 +-0.68506163 potentially work -0.048957378 +-3.2450058 , being -0.048957378 +-1.8264728 by being -0.048957378 +-2.1737466 of being -0.048957378 +-2.5321739 from being -0.048957378 +-1.9530969 reason being -0.187397 +-2.7130642 work being -0.048957378 +-2.091309 even being -0.048957378 +-1.586511 benefits being -0.048957378 +-1.1671549 subject being -0.048957378 +-1.463795 everything being -0.048957378 +-0.9925369 remember being -0.048957378 +-1.868011 being produced -0.048957378 +-2.174259 Therefore -0.5104914 +-2.5869553 it if -0.048957378 +-2.6021113 be if -0.048957378 +-2.076529 that if -0.04895735 +-1.862392 , if -0.13332285 +-2.5428126 and if -0.187397 +-2.1455538 studies if -0.048957378 +-2.0347157 -LRB- if -0.048957378 +-0.69116527 sought if -0.048957378 +-1.7964928 And if -0.048957378 +-1.519624 even if -0.048957378 +-2.293879 more if -0.048957378 +-2.028992 because if -0.048957378 +-2.1488059 these if -0.048957378 +-1.8033544 customers if -0.048957378 +-1.5774821 value if -0.048957378 +-1.2849771 decide if -0.048957378 +-1.9885925 than if -0.048957378 +-1.6198496 But if -0.048957378 +-1.2834638 determine if -0.048957378 +-1.6271508 later if -0.048957378 +-1.2834638 Even if -0.099168696 +-0.988917 nice if -0.048957378 +-0.69116527 desirable if -0.048957378 +-0.988917 household if -0.048957378 +-2.7426593 be related -0.048957378 +-1.862535 job related -0.50140065 +-2.9560866 is related -0.187397 +-2.668018 not related -0.048957378 +-2.7130642 work related -0.048957378 +-2.7644556 are related -0.048957378 +-2.4043431 all related -0.048957378 +-2.3429112 fs related -0.048957378 +-0.5847311 directly related -0.187397 +-1.8242902 responsibility related -0.048957378 +-2.1247215 smoking related -0.048957378 +-3.4614878 f -0.048957378 +-2.9123535 students f -0.048957378 +-1.5326759 graduates f -0.048957378 +-2.1322756 world f -0.048957378 +-0.6934062 epeople-skills f -0.048957378 +-0.9933765 ereal f -0.048957378 +-1.388153 mind f -0.048957378 +-0.6934062 etime f -0.048957378 +-2.873293 be found -0.048957378 +-1.5379899 positions found -0.048957378 +-3.1628284 -LRB- -0.048957378 +-2.1658301 reasons -LRB- -0.048957378 +-2.8508892 a -LRB- -0.048957378 +-2.5631618 job -LRB- -0.048957378 +-1.5166876 chosen -LRB- -0.048957378 +-2.5685596 on -LRB- -0.048957378 +-2.1402833 studies -LRB- -0.048957378 +-2.6202376 time -LRB- -0.048957378 +-2.6028714 work -LRB- -0.048957378 +-0.9883628 found -LRB- -0.048957378 +-1.4561495 level -LRB- -0.048957378 +-1.9418831 enough -LRB- -0.048957378 +-2.3302038 all -LRB- -0.048957378 +-0.9883628 commitments -LRB- -0.048957378 +-1.3774391 debt -LRB- 
-0.048957378 +-1.7959839 income -LRB- -0.048957378 +-1.8051797 way -LRB- -0.048957378 +-1.5696933 great -LRB- -0.048957378 +-1.2823725 co-workers -LRB- -0.048957378 +-1.2823725 wanted -LRB- -0.048957378 +-0.9883628 boss -LRB- -0.048957378 +-0.9883628 applied -LRB- -0.048957378 +-0.69088596 Reducing -LRB- -0.048957378 +-2.0827832 smoking -LRB- -0.048957378 +-1.2823725 rights -LRB- -0.048957378 +-0.69088596 alternatively -LRB- -0.048957378 +-2.725951 for such -0.048957378 +-2.5499542 be such -0.048957378 +-2.0572393 having such -0.048957378 +-2.7313151 is such -0.048957378 +-2.5106988 , such -0.27955192 +-2.4391725 in such -0.048957378 +-2.7188857 and such -0.048957378 +-1.9658114 from such -0.048957378 +-2.5396338 as such -0.048957378 +-2.3574555 jobs such -0.048957378 +-2.0175722 -LRB- such -0.048957378 +-2.0770268 but such -0.048957378 +-1.3749874 As such -0.048957378 +-0.6903279 oriented such -0.048957378 +-1.8676002 ; such -0.048957378 +-2.265639 more such -0.048957378 +-1.7679564 skills such -0.187397 +-1.9761689 make such -0.048957378 +-0.6903279 counter such -0.048957378 +-1.8188589 find such -0.048957378 +-2.1308677 learn such -0.048957378 +-1.5132132 afford such -0.048957378 +-1.661639 age such -0.048957378 +-0.9872564 virtues such -0.048957378 +-1.1592846 activity such -0.048957378 +-0.9872564 establishments such -0.048957378 +-0.6903279 obstructs such -0.048957378 +-0.6903279 behoove such -0.048957378 +-0.6903279 built such -0.048957378 +-0.6903279 implementing such -0.048957378 +-2.4889135 with an -0.048957378 +-2.3471477 for an -0.048957378 +-2.1775472 be an -0.048957378 +-1.4638371 is an -0.18543062 +-2.6945138 , an -0.048957378 +-2.4781325 not an -0.048957378 +-2.4207144 in an -0.048957378 +-2.4668753 and an -0.04895735 +-2.7679293 of an -0.048957378 +-2.643432 to an -0.048957378 +-1.4465187 example an -0.048957378 +-2.095947 has an -0.048957378 +-2.3968844 from an -0.048957378 +-1.2920727 as an -0.13332285 +-2.0311358 students an -0.13618872 +-2.5197482 on an -0.048957378 +-1.2721521 such an -0.04895735 +-2.1341379 have an -0.048957378 +-1.7793158 : an -0.048957378 +-2.5287702 are an -0.048957378 +-1.600312 at an -0.04895735 +-1.3662472 take an -0.048957378 +-2.3043654 them an -0.048957378 +-1.3444461 provide an -0.048957378 +-2.1658885 get an -0.048957378 +-2.0097125 out an -0.048957378 +-1.371942 now an -0.048957378 +-0.6896313 Consider an -0.048957378 +-1.2774949 build an -0.048957378 +-0.98587745 play an -0.048957378 +-1.5623918 complete an -0.048957378 +-1.5128433 choose an -0.048957378 +-1.371942 growth an -0.048957378 +-1.2796862 creating an -0.048957378 +-0.6896313 embraces an -0.048957378 +-2.469975 an internship -0.048957378 +-2.3044932 this -RRB- -0.048957378 +-2.5608191 it -RRB- -0.048957378 +-1.9826323 valuable -RRB- -0.048957378 +-1.8437781 less -RRB- -0.048957378 +-1.5203519 ? -RRB- -0.187397 +-0.69060683 internship -RRB- -0.048957378 +-1.5198623 fees -RRB- -0.048957378 +-1.9741774 future -RRB- -0.048957378 +-1.815958 environment -RRB- -0.048957378 +-1.9379536 enough -RRB- -0.048957378 +-1.8746905 classes -RRB- -0.048957378 +-1.7656285 responsibilities -RRB- -0.048957378 +-1.1618836 etc. -RRB- -0.048957378 +-1.2812839 assignments -RRB- -0.048957378 +-1.572441 did -RRB- -0.048957378 +-1.2812839 him -RRB- -0.048957378 +-1.2193218 ! 
[Remainder of hunk: the patch adds the bigram section of the language-model
test data file. Every added line follows the ARPA-style layout
"log10-probability token1 token2 log10-backoff", with tokens in PTB escape
form (-LRB-/-RRB- for parentheses), for example:

+-0.98780924 thing -RRB- -0.048957378
+-0.69060683 i -RRB- -0.048957378
+-0.69060683 ii -RRB- -0.048957378
+-1.8722535 , then -0.06917815
+-1.6387134 it may -0.1722646

Several thousand further "+"-prefixed bigram entries drawn from the essay
corpus (part-time jobs for college students, smoking) follow in the same
format; the hunk is cut off mid-entry at the end of this section.]
-0.048957378 +-1.8079702 I could -0.04895735 +-2.324456 this could -0.048957378 +-2.076529 that could -0.04895735 +-2.5774457 job could -0.048957378 +-3.0300741 , could -0.048957378 +-2.7927094 and could -0.048957378 +-1.829094 course could -0.048957378 +-2.2640738 study could -0.048957378 +-2.3481624 experience could -0.048957378 +-2.1455538 studies could -0.048957378 +-2.3859735 jobs could -0.048957378 +-2.0946295 but could -0.048957378 +-1.5759982 experiences could -0.048957378 +-1.3771682 industry could -0.048957378 +-1.5218195 parents could -0.048957378 +-2.0457268 This could -0.048957378 +-1.4531673 major could -0.048957378 +-1.2834638 nor could -0.048957378 +-1.5184352 graduates could -0.048957378 +-0.988917 solution could -0.048957378 +-0.69116527 gifted could -0.048957378 +-0.69116527 Universities could -0.048957378 +-0.69116527 Which could -0.048957378 +-1.1617546 man could -0.048957378 +-3.4931862 , expanding -0.048957378 +-2.4177067 by expanding -0.048957378 +-1.8383135 your horizons -0.048957378 +-1.1708648 thus increasing -0.048957378 +-1.295562 ever increasing -0.048957378 +-2.2010837 job opportunities -0.187397 +-3.4163227 to opportunities -0.048957378 +-2.3673859 The opportunities -0.048957378 +-1.8487579 job after -0.04895735 +-2.8235707 is after -0.048957378 +-2.5009751 student after -0.048957378 +-2.6542263 time after -0.048957378 +-1.8691221 work after -0.04895735 +-2.6244867 have after -0.048957378 +-1.624704 offer after -0.048957378 +-2.0592365 into after -0.048957378 +-1.9518652 enough after -0.048957378 +-2.3033705 do after -0.048957378 +-1.1629949 opportunities after -0.048957378 +-2.3535383 all after -0.048957378 +-2.360202 them after -0.048957378 +-2.3337302 life after -0.048957378 +-1.5750064 adult after -0.048957378 +-0.6915846 encounter after -0.048957378 +-1.5197701 until after -0.048957378 +-1.752015 home after -0.048957378 +-0.98974967 got after -0.048957378 +-0.6915846 teenagers after -0.048957378 +-0.6915846 Massachusetts after -0.048957378 +-2.0329945 In conclusion -0.20946135 +-2.7181385 I encourage -0.048957378 +-2.8944666 to encourage -0.048957378 +-2.5494218 should encourage -0.048957378 +-2.1138942 even encourage -0.048957378 +-1.7724454 for all -0.04895735 +-2.4961305 it all -0.048957378 +-2.4818623 be all -0.048957378 +-2.8685791 , all -0.048957378 +-2.4532244 not all -0.048957378 +-1.9334091 In all -0.048957378 +-2.153808 in all -0.37638456 +-1.9970981 of all -0.048957378 +-1.7129669 For all -0.048957378 +-2.2904274 experience all -0.048957378 +-1.9499487 from all -0.048957378 +-1.3695211 tired all -0.048957378 +-1.7837205 focus all -0.048957378 +-2.3924165 or all -0.048957378 +-1.632078 are all -0.099168696 +-1.2690912 at all -0.048957378 +-2.0151947 spend all -0.048957378 +-1.5561762 put all -0.048957378 +-2.2828355 people all -0.048957378 +-1.8412559 They all -0.048957378 +-2.2168543 when all -0.048957378 +-1.6478826 use all -0.048957378 +-2.0753345 after all -0.048957378 +-1.3671163 encourage all -0.048957378 +-2.2896473 them all -0.048957378 +-1.904745 believe all -0.048957378 +-2.2708921 life all -0.048957378 +-1.5031465 explore all -0.048957378 +-0.9847774 Should all -0.048957378 +-1.7355717 understand all -0.048957378 +-2.153615 we all -0.048957378 +-1.5584989 complete all -0.048957378 +-1.3695211 within all -0.048957378 +-1.2753446 cover all -0.048957378 +-0.6890748 covered all -0.048957378 +-0.6890748 behooves all -0.048957378 +-1.1556058 indeed all -0.048957378 +-0.6890748 standing all -0.048957378 +-0.6890748 amongst all -0.048957378 
+-1.3799049 disagree because -0.048957378 +-1.056526 statement because -0.187397 +-2.7771356 that because -0.048957378 +-2.5922155 job because -0.048957378 +-2.4548295 is because -0.048957378 +-3.0569723 , because -0.048957378 +-2.3959112 jobs because -0.048957378 +-2.230697 school because -0.048957378 +-2.1403506 college because -0.048957378 +-2.3037157 more because -0.048957378 +-0.9894719 advantage because -0.048957378 +-1.5188298 workers because -0.048957378 +-0.9894719 unfortunate because -0.048957378 +-1.622227 But because -0.048957378 +-1.8079246 responsibility because -0.048957378 +-1.5188298 simply because -0.048957378 +-1.52019 banned because -0.048957378 +-1.1625811 activity because -0.048957378 +-1.1625811 behind because -0.048957378 +-1.2845579 properly because -0.048957378 +-1.2845579 places because -0.048957378 +-0.69144475 ages because -0.048957378 +-2.5221097 the opportunity -0.27955192 +-2.3354475 The opportunity -0.048957378 +-1.5555515 an opportunity -0.11268411 +-2.3557408 important opportunity -0.048957378 +-1.4662849 educational opportunity -0.048957378 +-2.067544 good opportunity -0.048957378 +-1.9319644 no opportunity -0.048957378 +-0.6934062 wonderful opportunity -0.048957378 +-2.4724946 with them -0.048957378 +-1.6759287 for them -0.14912641 +-2.49764 of them -0.048957378 +-2.284967 to them -0.11268411 +-2.3825915 from them -0.048957378 +-1.6080599 benefit them -0.048957378 +-1.0032557 gives them -0.048957378 +-0.9719988 help them -0.09038658 +-1.8910046 take them -0.048957378 +-1.3677793 encourage them -0.048957378 +-1.5572013 teach them -0.048957378 +-1.0969558 give them -0.099168696 +-1.9568233 make them -0.048957378 +-1.156013 allowing them -0.048957378 +-1.603563 force them -0.048957378 +-0.4075199 prepares them -0.187397 +-1.156013 giving them -0.048957378 +-0.68921393 awaits them -0.048957378 +-1.8533039 support them -0.048957378 +-0.47519362 allows them -0.50140065 +-1.508652 let them -0.048957378 +-1.156013 enables them -0.048957378 +-0.4738289 teaches them -0.048957378 +-0.9850521 brought them -0.048957378 +-0.5822481 preparing them -0.048957378 +-0.68921393 kill them -0.048957378 +-1.2758812 serve them -0.048957378 +-1.156013 cause them -0.048957378 +-1.374476 interest them -0.048957378 +-0.879848 assist them -0.13618872 +-1.156013 prepare them -0.048957378 +-0.68921393 encouraging them -0.048957378 +-0.68921393 treating them -0.048957378 +-0.68921393 distracts them -0.048957378 +-0.68921393 fed them -0.048957378 +-0.68921393 impress them -0.048957378 +-0.68921393 guide them -0.048957378 +-0.68921393 suits them -0.048957378 +-3.0230536 their present -0.048957378 +-0.6942867 present situations -0.048957378 +-3.0230536 their futures -0.048957378 +-2.7324119 Japanese -0.04895735 +-2.9032245 that Japanese -0.048957378 +-3.087084 of Japanese -0.048957378 +-3.215579 to Japanese -0.048957378 +-2.2765498 if Japanese -0.048957378 +-2.1257288 most Japanese -0.048957378 +-1.3873869 As Japanese -0.048957378 +-1.5302515 workers Japanese -0.048957378 +-2.0924752 young Japanese -0.048957378 +-0.6931254 typical Japanese -0.048957378 +-2.3701181 many distractions -0.048957378 +-1.8928705 little distractions -0.048957378 +-1.9898291 enough distractions -0.048957378 +-2.9516091 for real -0.048957378 +-2.4001164 a real -0.04895735 +-2.0256789 the real -0.51468724 +-2.1770935 not real -0.048957378 +-3.0267124 in real -0.048957378 +-2.9101167 their real -0.048957378 +-2.3354475 The real -0.048957378 +-2.8054512 are real -0.048957378 +-3.0607083 is therefore -0.048957378 
+-2.9280992 , therefore -0.048957378 +-2.7167566 and therefore -0.04895735 +-3.3489213 to therefore -0.048957378 +-1.8884475 It therefore -0.048957378 +-2.109757 can provide -0.13618872 +-2.643165 will provide -0.048957378 +-2.668018 not provide -0.048957378 +-2.8238015 to provide -0.048957378 +-1.8236004 also provide -0.048957378 +-1.9166929 jobs provide -0.048957378 +-1.1671549 thus provide -0.048957378 +-2.2535412 they provide -0.048957378 +-1.463795 colleges provide -0.048957378 +-2.3033736 would provide -0.048957378 +-1.9283917 years provide -0.048957378 +-2.7689722 be useful -0.048957378 +-2.9155905 that useful -0.048957378 +-3.1138833 a useful -0.048957378 +-2.6598978 will useful -0.048957378 +-2.1415925 very useful -0.048957378 +-2.4326057 people useful -0.048957378 +-2.4152584 all useful -0.048957378 +-1.9312015 provide useful -0.048957378 +-0.9930965 extremely useful -0.048957378 +-2.3846152 this society -0.048957378 +-2.7474363 the society -0.048957378 +-2.207863 in society -0.04895735 +-2.35614 of society -0.11268411 +-2.805557 to society -0.048957378 +-2.8372147 their society -0.048957378 +-0.85448766 Japanese society -0.13618872 +-2.3312151 fs society -0.048957378 +-1.4629536 free society -0.048957378 +-0.6927047 broader society -0.048957378 +-0.991978 modern society -0.048957378 +-0.6927047 consumerist society -0.048957378 +-0.6927047 @ society -0.048957378 +-3.014355 their specialized -0.048957378 +-1.295562 highly specialized -0.048957378 +-3.09512 Workers -0.048957378 +-2.3059957 be expected -0.187397 +-2.7556913 not expected -0.048957378 +-2.8837602 are expected -0.048957378 +-2.920025 to function -0.048957378 +-3.3047535 a broad -0.048957378 +-3.4000094 the range -0.048957378 +-0.6942267 broad range -0.048957378 +-3.28005 of contexts -0.048957378 +-3.644728 Evaluations -0.048957378 +-3.3032444 must -0.048957378 +-2.6444798 it must -0.048957378 +-1.7280687 student must -0.04895735 +-2.9159434 and must -0.048957378 +-2.364132 students must -0.04895735 +-2.3363032 also must -0.048957378 +-2.2397017 one must -0.048957378 +-2.240784 they must -0.048957378 +-1.629438 There must -0.048957378 +-1.8745332 people must -0.048957378 +-0.9911411 Workers must -0.048957378 +-1.5245031 workers must -0.048957378 +-1.673106 Students must -0.048957378 +-1.3836298 she must -0.048957378 +-2.2331133 we must -0.048957378 +-0.6922843 context must -0.048957378 +-2.880715 be familiar -0.048957378 +-3.1699133 of done -0.048957378 +-2.170065 has done -0.048957378 +-1.8597006 being done -0.048957378 +-2.7526727 have done -0.04895735 +-2.004456 things done -0.048957378 +-1.5344893 anything done -0.048957378 +-1.8747771 this area -0.048957378 +-2.4503036 their area -0.187397 +-2.393125 fs area -0.048957378 +-2.7232327 of specialty -0.048957378 +-1.4597349 If -0.14450496 +-3.0607083 is n't -0.048957378 +-1.1330452 do n't -0.13332285 +-1.5933044 did n't -0.048957378 +-0.40935975 wo n't -0.048957378 +-0.40935975 ca n't -0.048957378 +-2.7082691 I whatever -0.048957378 +-2.9939594 for whatever -0.048957378 +-3.3971481 , whatever -0.048957378 +-2.709754 on whatever -0.048957378 +-2.3718047 do whatever -0.048957378 +-2.7329264 job might -0.048957378 +-2.4294076 experience might -0.048957378 +-2.8121924 they might -0.048957378 +-1.16925 employer might -0.048957378 +-2.2734885 what might -0.048957378 +-2.272803 we might -0.048957378 +-2.8497334 they produce -0.048957378 +-1.4712842 might produce -0.048957378 +-3.4625692 , fit -0.048957378 +-2.1934187 not fit -0.048957378 +-3.4163227 to fit -0.048957378 
+-3.644728 Neither -0.048957378 +-2.637264 , nor -0.04895735 +-1.1708648 customer nor -0.048957378 +-2.7708678 Learning -0.04895735 +-2.4029973 by itself -0.048957378 +-3.0728445 in itself -0.048957378 +-2.5414824 working itself -0.048957378 +-1.4687891 factor itself -0.048957378 +-0.9942178 manifest itself -0.048957378 +-3.644728 Because -0.048957378 +-2.0522041 's external -0.048957378 +-3.014355 their subject -0.048957378 +-0.6942267 specific subject -0.048957378 +-2.722524 will interfere -0.048957378 +-2.5091865 jobs interfere -0.048957378 +-2.4448311 their grades -0.048957378 +-2.0264761 better grades -0.048957378 +-2.08053 good grades -0.048957378 +-1.8357852 high grades -0.048957378 +-2.7309246 with specialization -0.048957378 +-1.3904605 It -0.36882287 +-3.0414262 is usually -0.048957378 +-2.6862712 will usually -0.048957378 +-2.8121924 they usually -0.048957378 +-2.0392778 's usually -0.048957378 +-2.8351026 are usually -0.048957378 +-1.727343 n't usually -0.048957378 +-2.2753892 some menial -0.048957378 +-2.010357 doing menial -0.048957378 +-1.4704666 usually menial -0.048957378 +-3.3047535 a specialist -0.048957378 +-3.1303647 is useless -0.048957378 +-2.5969746 I get -0.048957378 +-1.5868676 can get -0.099168696 +-2.109548 will get -0.048957378 +-2.3463118 and get -0.04895735 +-1.920955 to get -0.071063936 +-2.3583682 students get -0.048957378 +-2.2329066 one get -0.048957378 +-2.478221 should get -0.048957378 +-1.4958241 they get -0.09038658 +-2.5458775 or get -0.048957378 +-1.8848168 They get -0.048957378 +-2.1286578 could get -0.048957378 +-2.3731618 them get -0.048957378 +-1.7094948 n't get -0.048957378 +-1.45804 usually get -0.048957378 +-1.8618116 really get -0.048957378 +-1.3812549 ultimately get -0.048957378 +-1.5226038 actually get -0.048957378 +-1.5379899 chosen fields -0.048957378 +-1.4712842 different fields -0.048957378 +-3.09512 Internships -0.187397 +-1.9441198 provide relevant -0.048957378 +-1.3916435 various relevant -0.048957378 +-1.1705118 areas relevant -0.048957378 +-2.842708 be another -0.048957378 +-1.8640044 find another -0.048957378 +-1.3909432 hold another -0.048957378 +-0.9944986 Yet another -0.048957378 +-1.295562 another source -0.048957378 +-1.9687632 our source -0.048957378 +-3.2676826 of temptation -0.048957378 +-1.1708648 sample temptation -0.048957378 +-2.968909 that colleges -0.048957378 +-3.284121 the colleges -0.048957378 +-3.1764908 and colleges -0.048957378 +-2.9605632 their colleges -0.048957378 +-1.7680029 Japanese colleges -0.048957378 +-3.2496762 the club -0.048957378 +-2.9430885 their club -0.048957378 +-2.5665698 from club -0.048957378 +-2.6663196 or club -0.048957378 +-1.4683161 whether club -0.048957378 +-1.9376127 provide club -0.048957378 +-2.5041373 in activities -0.048957378 +-2.6748512 college activities -0.048957378 +-1.6757944 These activities -0.048957378 +-1.6318686 same activities -0.048957378 +-0.8840058 club activities -0.048957378 +-1.5590689 social activities -0.04895735 +-1.6380178 these activities -0.048957378 +-0.6925645 genuine activities -0.048957378 +-1.1659027 community activities -0.048957378 +-0.3786424 extra-curricular activities -0.04895735 +-0.6925645 Said activities -0.048957378 +-0.6925645 extracurricular activities -0.048957378 +-0.99169886 recreational activities -0.048957378 +-0.6925645 group activities -0.048957378 +-2.9818435 students socialize -0.048957378 +-2.8983755 are sufficient -0.048957378 +-1.8673797 find sufficient -0.048957378 +-2.1342583 will teach -0.187397 +-3.2641199 to teach 
-0.048957378 +-2.3805175 also teach -0.048957378 +-2.472681 jobs teach -0.048957378 +-2.3685565 may teach -0.048957378 +-1.5901128 helps teach -0.048957378 +-1.908163 ft teach -0.048957378 +-2.4193864 them teach -0.048957378 +-3.3032444 how -0.048957378 +-2.8273203 , how -0.048957378 +-2.9233415 in how -0.048957378 +-2.9159434 and how -0.13618872 +-2.9865563 of how -0.048957378 +-2.8372316 students how -0.048957378 +-2.1193862 you how -0.048957378 +-2.4628558 at how -0.048957378 +-2.3820214 them how -0.048957378 +-1.1701529 learning how -0.27955192 +-1.2899935 learn how -0.27955192 +-1.757883 understand how -0.048957378 +-1.8416208 Japan how -0.048957378 +-0.9911411 knew how -0.048957378 +-1.5832012 hand how -0.048957378 +-1.1650699 taught how -0.048957378 +-3.4650414 to behave -0.048957378 +-2.6044638 student away -0.048957378 +-2.2275927 time away -0.187397 +-1.9577026 take away -0.048957378 +-2.4242916 them away -0.048957378 +-1.467546 takes away -0.048957378 +-0.69354665 moved away -0.048957378 +-0.69354665 possess away -0.048957378 +-1.8144643 will give -0.11268411 +-2.4025857 and give -0.27955192 +-2.8627641 to give -0.048957378 +-1.923414 jobs give -0.187397 +-2.80292 they give -0.048957378 +-1.2928529 fll give -0.048957378 +-2.3196363 would give -0.048957378 +-2.4510674 them unstructured -0.048957378 +-2.561604 with social -0.048957378 +-2.654057 the social -0.048957378 +-3.043315 , social -0.048957378 +-1.947863 and social -0.12260215 +-2.892758 of social -0.048957378 +-2.7202551 their social -0.048957378 +-1.2840105 highly social -0.048957378 +-1.9849048 future social -0.048957378 +-2.3034658 important social -0.048957378 +-1.5086619 into social -0.048957378 +-0.691305 unstructured social -0.048957378 +-1.3778466 immediate social -0.048957378 +-1.3778466 various social -0.048957378 +-1.1621677 building social -0.048957378 +-2.1489596 learn social -0.048957378 +-0.9891944 joining social -0.048957378 +-1.9180458 our social -0.048957378 +-1.574284 develop social -0.048957378 +-0.9891944 stronger social -0.048957378 +-0.691305 inherent social -0.048957378 +-0.691305 communicative social -0.048957378 +-0.9891944 changing social -0.048957378 +-0.691305 exciting social -0.048957378 +-2.7309246 with unpredictable -0.048957378 +-0.6942867 unpredictable results -0.048957378 +-2.0881407 become unwilling -0.048957378 +-2.709418 with fellow -0.048957378 +-2.9787707 their fellow -0.048957378 +-2.5811288 from fellow -0.048957378 +-2.3897333 my fellow -0.048957378 +-3.4931862 , teachers -0.048957378 +-3.2799814 and teachers -0.048957378 +-3.1303647 is unworthy -0.048957378 +-3.4204583 the finely -0.048957378 +-0.6942867 finely honed -0.048957378 +-3.2177634 the workers -0.048957378 +-3.1476786 of workers -0.048957378 +-2.7665734 time workers -0.048957378 +-2.4264548 all workers -0.048957378 +-0.69354665 honed workers -0.048957378 +-0.69354665 unmotivated workers -0.048957378 +-0.69354665 Restaurant workers -0.048957378 +-2.5542762 the demands -0.13618872 +-2.543778 money demands -0.048957378 +-2.0112066 society demands -0.048957378 +-2.1644623 I strongly -0.3194954 +-0.99501497 depends strongly -0.048957378 +-1.3882775 I believe -0.3018265 +-1.8301041 also believe -0.187397 +-1.9103436 ft believe -0.048957378 +-1.1688302 strongly believe -0.40449065 +-1.8844885 really believe -0.048957378 +-1.1688302 truly believe -0.048957378 +-0.99365675 firmly believe -0.048957378 +-1.8426979 it fs -0.13332285 +-2.7957687 that fs -0.048957378 +-1.7236531 student fs -0.06505798 +-1.0015756 there fs -0.11268411 
+-0.9294223 one fs -0.07613316 +-1.0063283 today fs -0.04895735 +-1.6246173 There fs -0.048957378 +-1.8566378 It fs -0.048957378 +-1.4576565 club fs -0.048957378 +-1.5760767 someone fs -0.048957378 +-1.5231947 let fs -0.048957378 +-1.2856548 anyone fs -0.048957378 +-1.7090132 person fs -0.048957378 +-0.9900276 break fs -0.048957378 +-1.7430158 individual fs -0.048957378 +-0.6917244 What fs -0.048957378 +-0.6917244 Mammon fs -0.048957378 +-0.6917244 father fs -0.048957378 +-0.6917244 grandfather fs -0.048957378 +-0.6917244 Let fs -0.048957378 +-2.3569517 the main -0.099168696 +-2.3673859 The main -0.099168696 +-1.595883 two main -0.187397 +-1.829562 a lot -0.27406517 +-2.276171 this life -0.048957378 +-1.7769203 for life -0.1722646 +-1.9681419 valuable life -0.048957378 +-2.4229236 student life -0.048957378 +-1.6830112 in life -0.19828911 +-2.6758318 and life -0.048957378 +-2.0048287 of life -0.1722646 +-2.3063238 their life -0.048957378 +-1.9315253 working life -0.048957378 +-2.3148212 students life -0.187397 +-2.1932824 school life -0.048957378 +-1.4935124 college life -0.099168696 +-1.9616446 future life -0.048957378 +-1.7548814 real life -0.048957378 +-1.6122129 useful life -0.048957378 +-1.2258745 social life -0.04895735 +-1.2721993 fs life -0.11268411 +-1.888749 university life -0.048957378 +-1.9663883 make life -0.048957378 +-1.6101788 professional life -0.048957378 +-1.7243153 enjoy life -0.048957378 +-1.9488368 about life -0.048957378 +-1.5613256 great life -0.048957378 +-0.9861529 complicated life -0.048957378 +-1.2780342 positive life -0.048957378 +-1.3725495 essential life -0.048957378 +-1.6183119 later life -0.048957378 +-1.9663883 his life -0.048957378 +-0.8020367 daily life -0.187397 +-0.9861529 everyday life -0.048957378 +-0.6897705 \/ life -0.048957378 +-0.9861529 private life -0.048957378 +-0.9861529 enjoying life -0.048957378 +-0.6897705 boring life -0.048957378 +-3.340304 , entering -0.048957378 +-3.2905889 to entering -0.048957378 +-2.3127947 are entering -0.048957378 +-1.9802761 through entering -0.048957378 +-1.4683348 transition entering -0.048957378 +-1.7279366 before entering -0.048957378 +-0.69354665 postpone entering -0.048957378 +-2.5628006 the workforce -0.13618872 +-1.1705118 Having prior -0.048957378 +-0.9947796 workforce prior -0.048957378 +-0.9947796 exists prior -0.048957378 +-1.7715586 Many recently-graduated -0.048957378 +-2.9060166 are woefully -0.048957378 +-0.6942867 woefully unprepared -0.048957378 +-3.4000094 the realities -0.048957378 +-0.99501497 mundane realities -0.048957378 +-2.7912447 the responsibilities -0.04895735 +-2.3377671 many responsibilities -0.048957378 +-3.0629127 and responsibilities -0.048957378 +-2.6676896 of responsibilities -0.048957378 +-0.5546695 additional responsibilities -0.13618872 +-2.3520532 important responsibilities -0.048957378 +-2.1284394 social responsibilities -0.048957378 +-1.2917377 balancing responsibilities -0.048957378 +-0.6932658 scholastic responsibilities -0.048957378 +-2.873293 be particularly -0.048957378 +-2.637264 , particularly -0.04895735 +-2.8554769 for university -0.048957378 +-2.9859018 a university -0.048957378 +-2.3028388 many university -0.048957378 +-2.9159434 and university -0.048957378 +-2.6201468 of university -0.048957378 +-1.4596765 leave university -0.048957378 +-2.0577664 then university -0.048957378 +-2.5637717 or university -0.048957378 +-1.4001786 through university -0.187397 +-1.9324389 at university -0.048957378 +-0.9911411 view university -0.048957378 +-0.6922843 hire university 
-0.048957378 +-1.1650699 At university -0.048957378 +-1.2878569 cover university -0.048957378 +-0.9911411 onto university -0.048957378 +-0.6922843 contemporary university -0.048957378 +-2.811696 be used -0.048957378 +-3.0414262 is used -0.048957378 +-2.0743403 become used -0.048957378 +-2.0047097 well used -0.048957378 +-2.8351026 are used -0.048957378 +-1.6424524 never used -0.048957378 +-3.244391 of staying -0.048957378 +-3.4163227 to staying -0.048957378 +-2.8837602 are staying -0.048957378 +-2.8122976 is up -0.048957378 +-1.7813659 taking up -0.048957378 +-1.3785262 show up -0.048957378 +-1.9241494 take up -0.048957378 +-1.669152 give up -0.048957378 +-1.1625811 staying up -0.048957378 +-1.4539922 make up -0.048957378 +-1.4547855 entirely up -0.048957378 +-1.1625811 weigh up -0.048957378 +-1.8350438 getting up -0.048957378 +-0.3782798 growing up -0.04895735 +-0.70787346 end up -0.048957378 +-0.70787346 taken up -0.048957378 +-0.69144475 ending up -0.048957378 +-0.9894719 setting up -0.048957378 +-0.69144475 passed up -0.048957378 +-0.69144475 speed up -0.048957378 +-0.9894719 grow up -0.048957378 +-0.69144475 add up -0.048957378 +-0.69144475 picking up -0.048957378 +-0.69144475 catch up -0.048957378 +-0.69144475 ended up -0.048957378 +-3.3677967 , late -0.048957378 +-2.0396311 often late -0.048957378 +-3.0569165 in late -0.048957378 +-2.9430885 their late -0.048957378 +-2.1244118 up late -0.048957378 +-1.5341265 until late -0.048957378 +-3.4000094 the night -0.048957378 +-2.5303967 at night -0.048957378 +-2.859086 be sleeping -0.048957378 +-2.9492288 , sleeping -0.048957378 +-1.4704666 poor sleeping -0.048957378 +-3.5096595 , neglecting -0.048957378 +-3.014355 their commitments -0.048957378 +-1.9211627 academic commitments -0.048957378 +-2.1287274 with no -0.048957378 +-2.506663 is no -0.04895735 +-3.2241242 , no -0.048957378 +-2.9719586 in no -0.048957378 +-2.9937658 and no -0.048957378 +-3.050927 of no -0.048957378 +-1.591366 has no -0.048957378 +-1.8788315 have no -0.04895735 +-2.6019366 or no -0.048957378 +-2.7516074 are no -0.048957378 +-1.8527182 had no -0.048957378 +-0.99225736 virtually no -0.048957378 +-2.9605632 their immediate -0.048957378 +-2.3543274 The immediate -0.048957378 +-1.3902439 show immediate -0.048957378 +-2.2677426 get immediate -0.048957378 +-1.9389359 no immediate -0.048957378 +-1.3925304 immediate repercussions -0.048957378 +-2.7312138 a result -0.048957378 +-3.321535 the result -0.048957378 +-2.3608074 The result -0.048957378 +-1.9176081 ; result -0.048957378 +-2.710804 it difficult -0.187397 +-2.2832484 be difficult -0.048957378 +-3.1356838 a difficult -0.048957378 +-3.0052547 is difficult -0.187397 +-1.4643939 often difficult -0.048957378 +-1.5891323 been difficult -0.048957378 +-2.0727339 learning difficult -0.048957378 +-1.1684108 sometimes difficult -0.048957378 +-2.1142278 can make -0.048957378 +-1.812036 will make -0.13618872 +-3.0629127 and make -0.048957378 +-1.7910886 to make -0.067588836 +-2.3643124 may make -0.048957378 +-1.9709816 should make -0.048957378 +-1.7599807 To make -0.048957378 +-2.4326057 people make -0.048957378 +-1.1679918 shall make -0.048957378 +-1.8350283 necessary lifestyle -0.048957378 +-2.0452929 's lifestyle -0.048957378 +-1.5935729 adult lifestyle -0.048957378 +-0.69396824 well-rounded lifestyle -0.048957378 +-1.2958019 lifestyle adjustment -0.048957378 +-3.2979217 and subsequent -0.048957378 +-1.7519542 A -0.16549833 +-0.99501497 c A -0.048957378 +-3.5096595 , undertaken -0.048957378 +-2.6891787 I still -0.048957378 
+-2.7198546 it still -0.048957378 +-2.118745 can still -0.048957378 +-3.022964 is still -0.048957378 +-3.1159992 and still -0.048957378 +-2.3127947 are still -0.048957378 +-1.155879 while still -0.04895735 +-2.2967534 be quite -0.187397 +-2.2305765 is quite -0.04895735 +-2.0775375 become quite -0.048957378 +-1.3902439 itself quite -0.048957378 +-1.8887417 really quite -0.048957378 +-2.859086 be helpful -0.048957378 +-1.583305 very helpful -0.187397 +-1.5370556 quite helpful -0.187397 +-2.7187858 with regard -0.048957378 +-2.4476333 this regard -0.048957378 +-1.2950919 With regard -0.048957378 +-3.09512 Maintaining -0.048957378 +-2.0524554 valuable exercise -0.048957378 +-3.1138833 a personal -0.048957378 +-3.160214 the personal -0.048957378 +-2.526607 in personal -0.048957378 +-3.0629127 and personal -0.048957378 +-2.418458 their personal -0.04895735 +-2.0304089 's personal -0.048957378 +-2.4152584 all personal -0.048957378 +-1.3890452 managing personal -0.048957378 +-2.3621502 my personal -0.048957378 +-2.3748312 many recent -0.048957378 +-1.7708699 Many recent -0.048957378 +-2.9793792 for graduates -0.048957378 +-3.2496762 the graduates -0.048957378 +-1.9390326 university graduates -0.048957378 +-0.40930387 recent graduates -0.048957378 +-1.8593426 new graduates -0.048957378 +-0.69368714 minted graduates -0.048957378 +-1.5379899 graduates similarly -0.048957378 +-1.5956279 made similarly -0.048957378 +-2.7972748 have difficulty -0.048957378 +-3.0808864 is managing -0.048957378 +-2.9385355 , managing -0.187397 +-2.5503032 in managing -0.048957378 +-3.21812 of managing -0.048957378 +-3.3144484 , finances -0.048957378 +-2.9101167 their finances -0.048957378 +-2.2827988 if finances -0.048957378 +-1.8567461 family finances -0.048957378 +-2.0154495 about finances -0.048957378 +-2.3675284 my finances -0.048957378 +-2.0342937 his finances -0.048957378 +-0.6934062 unless finances -0.048957378 +-3.4000094 the responsibly -0.048957378 +-1.6467657 finances responsibly -0.048957378 +-2.3772526 many western -0.048957378 +-2.3701181 many countries -0.048957378 +-1.7695215 Many countries -0.048957378 +-0.69410884 western countries -0.048957378 +-2.45335 this unfortunate -0.048957378 +-3.120598 is unfortunate -0.048957378 +-3.4000094 the reality -0.048957378 +-0.99501497 unfortunate reality -0.048957378 +-2.0522888 often combines -0.048957378 +-3.4000094 the levels -0.048957378 +-1.8382648 high levels -0.048957378 +-3.0728445 in debt -0.048957378 +-3.1933482 of debt -0.048957378 +-2.7334032 college debt -0.048957378 +-2.025878 future debt -0.048957378 +-1.4696376 reduce debt -0.048957378 +-2.700248 with credit -0.048957378 +-2.839973 the credit -0.048957378 +-2.709754 on credit -0.048957378 +-2.093081 If credit -0.048957378 +-0.6938276 Using credit -0.048957378 +-1.6922824 use cards -0.048957378 +-0.21341059 credit cards -0.048957378 +-2.6128922 student loans -0.048957378 +-3.1699133 of loans -0.048957378 +-2.7018807 on loans -0.048957378 +-2.1419587 -RRB- loans -0.048957378 +-1.7661085 tuition loans -0.048957378 +-1.8308595 your loans -0.048957378 +-2.637264 , etc. -0.13618872 +-0.6942267 uniforms etc. 
-0.048957378 +-3.1293185 in grave -0.048957378 +-2.013212 financial peril -0.048957378 +-3.1215048 in providing -0.048957378 +-2.2432292 only providing -0.048957378 +-3.321535 the cash -0.048957378 +-3.21812 of cash -0.048957378 +-1.7676849 personal cash -0.048957378 +-0.69396824 substantial cash -0.048957378 +-1.2958019 cash flow -0.048957378 +-3.4000094 the least -0.048957378 +-1.9512843 at least -0.187397 +-2.4247963 a small -0.13618872 +-1.5384331 even small -0.048957378 +-2.8890827 for income -0.048957378 +-3.063924 the income -0.048957378 +-2.0716856 any income -0.048957378 +-2.6398885 of income -0.048957378 +-2.8372147 their income -0.048957378 +-1.4644498 additional income -0.048957378 +-2.305698 The income -0.048957378 +-2.3925824 an income -0.048957378 +-1.8243035 extra income -0.048957378 +-2.237829 some income -0.048957378 +-1.462143 added income -0.048957378 +-1.2895159 needed income -0.048957378 +-0.6927047 modest income -0.048957378 +-2.5237396 with which -0.048957378 +-2.5146606 , which -0.13618872 +-2.1738102 in which -0.13618872 +-2.7303507 and which -0.048957378 +-2.5436628 of which -0.187397 +-2.3280525 experience which -0.048957378 +-2.58383 work which -0.048957378 +-1.5665365 f which -0.048957378 +-2.4222057 money which -0.048957378 +-1.6157081 degree which -0.048957378 +-1.2807405 internships which -0.048957378 +-1.7689643 skills which -0.048957378 +-1.8141 environment which -0.048957378 +-1.9507065 doing which -0.048957378 +-1.1005982 something which -0.048957378 +-1.9674705 activities which -0.048957378 +-2.3046746 life which -0.048957378 +-1.7641524 responsibilities which -0.048957378 +-1.7910352 income which -0.048957378 +-2.070445 world which -0.048957378 +-0.98753273 `` which -0.048957378 +-1.709859 against which -0.048957378 +-1.3737917 system which -0.048957378 +-1.657756 week which -0.048957378 +-0.69046736 facts which -0.048957378 +-0.69046736 travel which -0.048957378 +-0.69046736 sum which -0.048957378 +-0.69046736 paths which -0.048957378 +-2.0748606 smoking which -0.048957378 +-3.4650414 to counter -0.048957378 +-2.1964433 such debts -0.048957378 +-2.7368789 I largely -0.048957378 +-2.5645084 is largely -0.048957378 +-1.9320052 Firstly -0.58723795 +-2.1028194 having been -0.048957378 +-2.1756747 has been -0.048957378 +-1.5503422 have been -0.048957378 +-1.644695 never been -0.048957378 +-2.7368789 I myself -0.048957378 +-2.6468444 student myself -0.048957378 +-1.4949656 I am -0.12183642 +-0.6942267 gI am -0.048957378 +-2.4103017 more aware -0.048957378 +-2.4699326 people aware -0.048957378 +-1.5370556 am aware -0.048957378 +-2.3701181 many pressures -0.048957378 +-1.4290154 financial pressures -0.048957378 +-2.1481786 social pressures -0.048957378 +-1.2958019 pressures generated -0.048957378 +-2.6254168 with living -0.048957378 +-3.018787 a living -0.048957378 +-3.1851513 , living -0.048957378 +-2.9531138 and living -0.048957378 +-2.6332078 of living -0.048957378 +-3.1322563 to living -0.048957378 +-2.8239956 their living -0.048957378 +-2.5824351 or living -0.048957378 +-2.3254833 fs living -0.048957378 +-1.715363 still living -0.048957378 +-1.9010408 those living -0.048957378 +-2.3362148 my living -0.048957378 +-1.3848785 daily living -0.187397 +-1.1659027 general living -0.048957378 +-1.7676849 personal expenses -0.048957378 +-0.2911761 living expenses -0.2246826 +-2.223746 these expenses -0.048957378 +-1.294531 cover expenses -0.048957378 +-2.7708678 Often -0.13618872 +-2.7096968 that these -0.048957378 +-2.5106988 , these -0.04895735 +-1.9554527 In 
these -0.048957378 +-2.1717682 in these -0.04895735 +-2.8156636 of these -0.048957378 +-2.902241 to these -0.048957378 +-1.7263924 For these -0.3855526 +-2.4218035 from these -0.048957378 +-2.3253055 students these -0.187397 +-2.546186 on these -0.048957378 +-2.5625246 have these -0.048957378 +-1.8171283 like these -0.048957378 +-2.2691412 do these -0.048957378 +-1.5673072 done these -0.048957378 +-1.5654893 teach these -0.048957378 +-1.9260192 during these -0.048957378 +-0.6903279 easing these -0.048957378 +-0.6903279 demonstrating these -0.048957378 +-1.3731195 improve these -0.048957378 +-1.5113788 explore these -0.048957378 +-1.1592846 weigh these -0.048957378 +-1.5184664 costs these -0.048957378 +-1.2801979 practice these -0.048957378 +-1.5166097 appreciate these -0.048957378 +-1.5113788 All these -0.048957378 +-1.2801979 pass these -0.048957378 +-0.6903279 Sometimes these -0.048957378 +-0.9872564 supporting these -0.048957378 +-0.6903279 across these -0.048957378 +-0.6903279 Between these -0.048957378 +-2.9060166 are met -0.048957378 +-3.3047535 a combination -0.048957378 +-3.5096595 , grants -0.048957378 +-2.6489563 with family -0.048957378 +-2.387958 a family -0.04895735 +-2.7687883 the family -0.04895735 +-3.0156138 and family -0.048957378 +-2.4083414 their family -0.27955192 +-2.612026 or family -0.048957378 +-2.0117862 future family -0.048957378 +-1.7606759 Japanese family -0.048957378 +-1.2906253 small family -0.048957378 +-1.9244446 own family -0.048957378 +-1.386072 every family -0.048957378 +-2.2313263 these savings -0.048957378 +-1.8673797 family savings -0.048957378 +-1.4715303 usually restrict -0.048957378 +-2.163114 job during -0.25144583 +-3.1493895 , during -0.048957378 +-2.2962427 study during -0.048957378 +-1.9587044 working during -0.048957378 +-2.6911027 time during -0.048957378 +-2.6731453 work during -0.048957378 +-1.9390066 themselves during -0.048957378 +-1.2878569 lifestyle during -0.048957378 +-2.075721 much during -0.048957378 +-1.6740857 worked during -0.048957378 +-1.9017588 hard during -0.048957378 +-0.6922843 wastes during -0.048957378 +-0.9911411 element during -0.048957378 +-0.9911411 unproductive during -0.048957378 +-1.3826258 goal during -0.048957378 +-0.6922843 playing during -0.048957378 +-3.2908964 a crucial -0.048957378 +-1.1708648 developing crucial -0.048957378 +-1.8729675 this period -0.048957378 +-3.2358162 a period -0.048957378 +-0.9944986 crucial period -0.048957378 +-1.1700908 critical period -0.048957378 +-2.8272688 the adult -0.187397 +-3.1451926 and adult -0.048957378 +-2.9430885 their adult -0.048957378 +-2.4390335 an adult -0.04895735 +-1.5348523 independent adult -0.048957378 +-0.69368714 functioning adult -0.048957378 +-2.9939594 for development -0.048957378 +-2.839973 the development -0.187397 +-3.1764908 and development -0.048957378 +-1.5924584 adult development -0.048957378 +-0.6938276 all-round development -0.048957378 +-2.9652722 for earning -0.048957378 +-2.7969828 be earning -0.048957378 +-2.941432 that earning -0.048957378 +-2.90795 , earning -0.048957378 +-3.1159992 and earning -0.048957378 +-2.3863747 also earning -0.048957378 +-2.8200238 are earning -0.048957378 +-3.3047535 a constructive -0.048957378 +-2.3846152 this way -0.048957378 +-3.036213 a way -0.048957378 +-2.0211258 valuable way -0.048957378 +-3.063924 the way -0.048957378 +-2.8372147 their way -0.048957378 +-2.5920763 or way -0.048957378 +-1.9205885 no way -0.048957378 +-0.6927047 constructive way -0.048957378 +-1.9198133 own way -0.048957378 +-1.5836447 great 
way -0.048957378 +-1.5836447 best way -0.048957378 +-2.3412793 my way -0.048957378 +-2.0203838 his way -0.048957378 +-3.28005 of easing -0.048957378 +-2.4133134 by allowing -0.048957378 +-3.246793 and allowing -0.048957378 +-0.9947796 whilst allowing -0.048957378 +-1.3541759 can lead -0.5805819 +-3.3971481 , lead -0.048957378 +-3.3489213 to lead -0.048957378 +-1.2794199 may lead -0.20946135 +-1.3902439 ultimately lead -0.048957378 +-2.453272 an independent -0.048957378 +-1.5034711 become independent -0.04895735 +-2.4039898 more independent -0.048957378 +-1.0617812 financially independent -0.048957378 +-1.9317044 Secondly -0.916374 +-0.6942267 distracter Secondly -0.048957378 +-2.1544547 with regards -0.187397 +-2.013934 employment prospects -0.048957378 +-2.5645084 is rarely -0.048957378 +-2.8983755 are rarely -0.048957378 +-2.2268438 only factor -0.048957378 +-2.3632116 important factor -0.048957378 +-1.5351577 second factor -0.048957378 +-1.3895458 last factor -0.048957378 +-1.3899099 negative factor -0.048957378 +-1.913908 health factor -0.048957378 +-2.5599043 the majority -0.27955192 +-1.5956279 large majority -0.048957378 +-3.2085369 a graduate -0.048957378 +-3.1933482 of graduate -0.048957378 +-3.3489213 to graduate -0.048957378 +-1.7429656 they graduate -0.1722646 +-1.7054864 we graduate -0.048957378 +-3.2900462 , professional -0.048957378 +-2.0833616 any professional -0.048957378 +-3.0123634 in professional -0.048957378 +-2.8945246 their professional -0.048957378 +-2.0648875 become professional -0.048957378 +-2.016433 future professional -0.048957378 +-0.6932658 qualified professional -0.048957378 +-0.6932658 educated professional -0.048957378 +-1.5885828 develop professional -0.048957378 +-3.2979217 and transferable -0.048957378 +-2.2186792 the ability -0.2403142 +-3.428628 , ability -0.048957378 +-2.9787707 their ability -0.048957378 +-1.9171832 academic ability -0.048957378 +-2.920025 to communicate -0.048957378 +-2.6489563 with customers -0.048957378 +-2.9130242 for customers -0.048957378 +-2.7687883 the customers -0.048957378 +-2.3632026 of customers -0.04895735 +-1.7915213 restaurant customers -0.048957378 +-2.6645222 on customers -0.048957378 +-2.0724325 If customers -0.048957378 +-1.6818868 give customers -0.048957378 +-0.9925369 handling customers -0.048957378 +-1.386072 non-smoking customers -0.048957378 +-0.6929851 selfish customers -0.048957378 +-2.7969828 be responsible -0.048957378 +-2.7055588 a responsible -0.048957378 +-3.1159992 and responsible -0.048957378 +-1.8577744 being responsible -0.048957378 +-2.0711665 become responsible -0.048957378 +-0.99365675 institutions responsible -0.048957378 +-1.2928529 Being responsible -0.048957378 +-3.102048 is almost -0.048957378 +-2.8837602 are almost -0.048957378 +-1.2950919 given almost -0.048957378 +-1.3382984 academic achievement -0.187397 +-2.569465 Not -0.048957378 +-3.28005 of demonstrating -0.048957378 +-2.233106 these attributes -0.048957378 +-2.3701181 many aspects -0.048957378 +-1.966773 other aspects -0.048957378 +-2.285778 what aspects -0.048957378 +-2.250529 to enjoy -0.048957378 +-2.821667 they enjoy -0.048957378 +-2.434271 them enjoy -0.048957378 +-1.3101277 really enjoy -0.048957378 +-1.1696702 truly enjoy -0.048957378 +-3.4650414 to expect -0.048957378 +-2.941432 that once -0.048957378 +-3.1159992 and once -0.048957378 +-1.5340091 relationships once -0.048957378 +-2.3897834 life once -0.048957378 +-1.7266748 were once -0.048957378 +-1.5335816 universities once -0.048957378 +-1.1692609 begins once 
-0.048957378 +-2.4324028 Thirdly -0.50140065 +-2.4266567 a wider -0.04895735 +-1.1710447 wider scale -0.048957378 +-3.2908964 a necessity -0.048957378 +-3.2676826 of necessity -0.048957378 +-2.469975 an economical -0.048957378 +-2.4126248 a sense -0.13618872 +-2.0235913 better sense -0.048957378 +-0.6938276 economical sense -0.048957378 +-1.2948219 growing sense -0.048957378 +-0.6938276 false sense -0.048957378 +-1.7708699 Many businesses -0.048957378 +-1.1708648 allowing businesses -0.048957378 +-2.7659814 not require -0.048957378 +-0.99501497 businesses require -0.048957378 +-2.218951 work force -0.0796529 +-2.3904223 may force -0.048957378 +-2.3365319 would force -0.048957378 +-2.873293 be flexible -0.048957378 +-3.120598 is flexible -0.048957378 +-2.920025 to fill -0.048957378 +-2.097776 I would -0.04895735 +-2.3044932 this would -0.048957378 +-1.8330964 it would -0.04895735 +-2.0693212 that would -0.04895735 +-2.130787 job would -0.048957378 +-2.4572146 student would -0.048957378 +-2.7388577 , would -0.048957378 +-2.7421265 and would -0.048957378 +-1.7469382 study would -0.13618872 +-2.0694222 there would -0.048957378 +-2.0406728 students would -0.04895735 +-2.0217214 then would -0.048957378 +-1.7249643 they would -0.09038658 +-2.033479 This would -0.048957378 +-1.8405796 It would -0.048957378 +-2.308202 life would -0.048957378 +-1.6185179 finances would -0.048957378 +-1.2812839 That would -0.048957378 +-0.98780924 answer would -0.048957378 +-0.98780924 store would -0.048957378 +-1.5149469 Some would -0.048957378 +-1.5181254 books would -0.048957378 +-1.6573156 Students would -0.187397 +-1.3762115 she would -0.048957378 +-2.1898277 we would -0.048957378 +-1.3744649 choice would -0.048957378 +-1.371275 restaurants would -0.048957378 +-2.0774853 smoking would -0.048957378 +-1.6908859 full-time member -0.048957378 +-2.1499453 -RRB- member -0.048957378 +-0.5854695 contributing member -0.187397 +-3.28005 of staff -0.048957378 +-2.0329945 In contrast -0.048957378 +-2.08053 good arguments -0.048957378 +-2.223746 these arguments -0.048957378 +-1.5935729 both arguments -0.048957378 +-0.9944986 separate arguments -0.048957378 +-2.672149 can however -0.048957378 +-2.291106 , however -0.1722646 +-1.9625137 reason however -0.048957378 +-2.1961105 studies however -0.048957378 +-1.9135503 ; however -0.048957378 +-1.2934115 arguments however -0.048957378 +-3.4650414 to cope -0.187397 +-2.7665153 it depends -0.048957378 +-0.99501497 commitments depends -0.048957378 +-3.321535 the situation -0.048957378 +-2.1127527 This situation -0.048957378 +-1.7676849 personal situation -0.048957378 +-1.9410545 own situation -0.048957378 +-1.1710447 largely determined -0.048957378 +-2.013212 financial condition -0.048957378 +-3.1356838 a year -0.048957378 +-2.802924 the year -0.187397 +-2.9101167 their year -0.048957378 +-2.2788568 school year -0.048957378 +-1.3891406 final year -0.048957378 +-1.388153 every year -0.048957378 +-0.6934062 sophomore year -0.048957378 +-0.6934062 4th year -0.048957378 +-3.4000094 the type -0.048957378 +-1.5221592 any type -0.187397 +-2.3022652 school builds -0.048957378 +-3.5096595 , encourages -0.048957378 +-2.0120695 financial stake -0.048957378 +-0.99501497 `` stake -0.048957378 +-2.7951853 for education -0.048957378 +-2.9033272 the education -0.048957378 +-2.0239635 of education -0.04895735 +-2.079489 their education -0.40449065 +-2.4649103 from education -0.048957378 +-1.285946 quality education -0.048957378 +-2.230697 school education -0.048957378 +-1.8528848 an education 
-0.048957378 +-1.63323 college education -0.17557326 +-1.9939408 's education -0.048957378 +-2.0249267 good education -0.048957378 +-2.282163 fs education -0.048957378 +-2.0578392 much education -0.048957378 +-2.2976928 my education -0.048957378 +-1.2845579 higher education -0.048957378 +-0.9894719 advanced education -0.048957378 +-1.1625811 proper education -0.048957378 +-1.9203491 our education -0.048957378 +-1.2845579 further education -0.048957378 +-0.8040018 tertiary education -0.048957378 +-1.8836768 health education -0.048957378 +-0.69144475 incomplete education -0.048957378 +-2.45335 this prepares -0.048957378 +-3.2799814 and prepares -0.048957378 +-3.0543673 a world -0.048957378 +-1.8925264 the world -0.17892508 +-1.9649823 working world -0.048957378 +-2.7047815 work world -0.048957378 +-1.9002689 academic world -0.04895735 +-1.2174666 real world -0.07333779 +-2.3370237 fs world -0.048957378 +-1.5847367 adult world -0.048957378 +-0.99225736 ereal world -0.048957378 +-0.69284487 fascinating world -0.048957378 +-0.69284487 Disney world -0.048957378 +-0.69284487 greal world -0.048957378 +-2.7267487 will encounter -0.048957378 +-1.2958019 fre countless -0.048957378 +-2.880715 be acquired -0.048957378 +-2.1525254 most readily -0.048957378 +-0.6942867 readily apparent -0.048957378 +-2.709418 with balancing -0.048957378 +-3.428628 , balancing -0.048957378 +-3.089379 in balancing -0.048957378 +-3.2102208 and balancing -0.048957378 +-3.644728 Skills -0.048957378 +-1.8680823 like prioritization -0.048957378 +-3.5096595 , multitasking -0.048957378 +-3.428628 , finding -0.048957378 +-3.089379 in finding -0.048957378 +-3.21812 of finding -0.048957378 +-2.8944666 to finding -0.048957378 +-2.4483666 for success -0.048957378 +-3.1880362 the success -0.048957378 +-3.0267124 in success -0.048957378 +-1.1684108 considerable success -0.048957378 +-1.3347516 academic success -0.048957378 +-1.3895564 finding success -0.048957378 +-1.9314854 own success -0.048957378 +-0.6934062 achieve success -0.048957378 +-2.6254168 with going -0.048957378 +-2.843989 , going -0.187397 +-2.6392229 not going -0.048957378 +-2.6256745 and going -0.187397 +-2.641398 as going -0.048957378 +-2.2094445 time going -0.048957378 +-2.064076 then going -0.048957378 +-1.8155167 : going -0.048957378 +-2.5824351 or going -0.048957378 +-2.726998 are going -0.048957378 +-2.2914126 when going -0.048957378 +-2.0492468 while going -0.187397 +-1.7513353 always going -0.048957378 +-1.1659027 Balancing going -0.048957378 +-2.7329264 job provides -0.048957378 +-2.4294076 experience provides -0.048957378 +-2.5355608 working provides -0.048957378 +-2.392312 also provides -0.048957378 +-2.7264938 college provides -0.048957378 +-1.727343 still provides -0.048957378 +-3.2908964 a perfect -0.048957378 +-2.878185 the perfect -0.048957378 +-2.045974 valuable training -0.048957378 +-3.321535 the training -0.048957378 +-2.0674677 because training -0.048957378 +-1.1700908 perfect training -0.048957378 +-1.2956754 training ground -0.048957378 +-1.1708648 middle ground -0.048957378 +-2.1471612 will improve -0.048957378 +-2.90557 to improve -0.048957378 +-2.2294533 help improve -0.048957378 +-2.569465 Too -0.11268411 +-3.321535 the burden -0.048957378 +-2.0071845 financial burden -0.048957378 +-1.1700908 proper burden -0.048957378 +-0.69396824 unnecessary burden -0.048957378 +-3.120598 is supported -0.048957378 +-1.6465397 fully supported -0.048957378 +-3.3489213 to entirely -0.048957378 +-2.3983316 also entirely -0.048957378 +-0.9942178 supported entirely 
[Data hunk elided for readability: the patch adds several thousand n-gram
entries, one per line, each of the form "log10 probability, n-gram, log10
backoff weight", e.g. "-0.9942178 herself entirely -0.048957378".]
-0.048957378 +-1.3895458 developed simply -0.048957378 +-2.323799 would simply -0.048957378 +-0.9939372 well-being simply -0.048957378 +-3.4931862 , hang -0.048957378 +-1.5378767 simply hang -0.048957378 +-2.0880563 good friend -0.048957378 +-2.7187858 with developing -0.048957378 +-3.246793 and developing -0.048957378 +-2.7259424 on developing -0.048957378 +-3.4204583 the danger -0.048957378 +-3.28005 of ending -0.048957378 +-3.3047535 a materialistic -0.048957378 +-0.6942867 materialistic inclined -0.048957378 +-2.7309246 with diminished -0.048957378 +-3.28005 of understating -0.048957378 +-2.5958927 with our -0.048957378 +-2.6613982 be our -0.048957378 +-3.11635 , our -0.048957378 +-1.9867003 In our -0.048957378 +-2.9009323 in our -0.048957378 +-2.9576278 of our -0.048957378 +-3.0623753 to our -0.048957378 +-2.4881823 from our -0.048957378 +-2.123871 on our -0.048957378 +-2.3032832 fs our -0.048957378 +-2.0069056 make our -0.048957378 +-1.3812549 improve our -0.048957378 +-1.1642387 fulfill our -0.048957378 +-1.45804 around our -0.048957378 +-1.5226038 All our -0.048957378 +-0.69200426 understating our -0.048957378 +-1.2867545 its our -0.048957378 +-0.69200426 shows our -0.048957378 +-1.1710447 human beings -0.048957378 +-2.8176763 time ... -0.048957378 +-2.0747159 that we -0.04895735 +-2.570245 job we -0.048957378 +-2.2481103 , we -0.048957378 +-2.5689516 as we -0.048957378 +-1.9277668 reason we -0.048957378 +-1.8939558 jobs we -0.048957378 +-2.221037 school we -0.048957378 +-1.700656 if we -0.187397 +-2.030456 then we -0.048957378 +-2.4388337 money we -0.048957378 +-2.2005894 so we -0.048957378 +-1.621705 College we -0.048957378 +-1.0738788 when we -0.04895735 +-2.2028208 what we -0.048957378 +-2.0277584 If we -0.048957378 +-1.9283803 believe we -0.13618872 +-1.4172114 where we -0.048957378 +-1.7068932 before we -0.048957378 +-1.5160207 until we -0.048957378 +-1.3764908 system we -0.048957378 +-1.8815417 think we -0.048957378 +-1.6670213 age we -0.048957378 +-1.2829179 When we -0.048957378 +-1.4523605 everything we -0.048957378 +-0.69102556 whenever we -0.048957378 +-2.387195 do sooner -0.048957378 +-2.4342768 this opinion -0.20946135 +-2.5411224 the opinion -0.13618872 +-2.3543274 The opinion -0.048957378 +-1.766022 personal opinion -0.048957378 +-0.9614261 my opinion -0.12260215 +-3.4650414 to rush -0.048957378 +-2.8983755 are concentrated -0.048957378 +-0.99501497 Without concentrated -0.048957378 +-2.4081247 by its -0.048957378 +-2.5208 at its -0.048957378 +-1.6444623 since its -0.048957378 +-1.294531 determine its -0.048957378 +-2.4568932 for us -0.048957378 +-1.5920713 helps us -0.187397 +-1.3895458 show us -0.048957378 +-1.5348523 let us -0.048957378 +-1.3895458 allow us -0.048957378 +-1.4679527 around us -0.048957378 +-2.3542027 study causes -0.048957378 +-1.6458207 main causes -0.048957378 +-2.0497692 smoke causes -0.048957378 +-1.1710447 human unhappiness -0.048957378 +-3.4204583 the love -0.048957378 +-3.4204583 the sole -0.048957378 +-2.2215545 a few -0.099168696 +-1.9459207 how few -0.048957378 +-1.3916435 last few -0.048957378 +-1.4715303 few bucks -0.048957378 +-2.7477093 a distraction -0.048957378 +-1.1708648 obvious distraction -0.048957378 +-3.106568 in further -0.048957378 +-2.7398808 and further -0.048957378 +-1.1706803 contributing further -0.048957378 +-1.2958019 further cloud -0.048957378 +-3.4204583 the tricky -0.048957378 +-0.6942867 tricky monetary -0.048957378 +-0.6942867 monetary enslavement -0.048957378 +-2.7167566 and live -0.048957378 +-3.3489213 to live 
-0.048957378 +-2.5439303 should live -0.048957378 +-2.0068414 well live -0.048957378 +-2.276979 we live -0.048957378 +-2.6468444 student under -0.048957378 +-1.4711709 live under -0.048957378 +-1.2958019 short span -0.048957378 +-1.5961916 between childhood -0.048957378 +-3.4650414 to assure -0.048957378 +-3.246793 and mental -0.048957378 +-2.4103017 more mental -0.048957378 +-2.393125 fs mental -0.048957378 +-1.1710447 mental stability -0.048957378 +-2.7708678 Every -0.13618872 +-2.4119422 for his -0.048957378 +-2.1990962 in his -0.13618872 +-2.8984869 and his -0.048957378 +-2.3423495 of his -0.04895735 +-2.771218 to his -0.048957378 +-2.1259377 on his -0.048957378 +-2.0556114 -LRB- his -0.048957378 +-1.8452199 pay his -0.048957378 +-2.2817597 when his -0.048957378 +-1.8918848 support his -0.048957378 +-1.3819399 manage his -0.048957378 +-1.3819399 allow his -0.048957378 +-1.5235524 lose his -0.048957378 +-0.6921442 assert his -0.048957378 +-0.6921442 starting his -0.048957378 +-0.6921442 corrupts his -0.048957378 +-0.6921442 focuses his -0.048957378 +-2.4612203 for her -0.048957378 +-3.1933482 of her -0.048957378 +-3.3489213 to her -0.048957378 +-2.678043 or her -0.048957378 +-1.5351007 lose her -0.048957378 +-3.09512 No -0.048957378 +-0.995135 No deviation -0.048957378 +-2.7580705 it allowed -0.048957378 +-2.859086 be allowed -0.13618872 +-2.8837602 are allowed -0.048957378 +-3.120598 is absolutely -0.048957378 +-1.5378767 am absolutely -0.048957378 +-2.7796392 job executed -0.048957378 +-2.104077 any conditions -0.048957378 +-1.8378968 necessary conditions -0.048957378 +-3.644728 Studying -0.048957378 +-2.7044797 can influence -0.048957378 +-1.3923441 negative influence -0.048957378 +-3.102048 is sometimes -0.048957378 +-3.4625692 , sometimes -0.048957378 +-3.246793 and sometimes -0.048957378 +-2.401619 fs fate -0.048957378 +-3.09512 Depending -0.187397 +-3.4204583 the destiny -0.048957378 +-2.5628006 the entire -0.04895735 +-3.0382125 for generations -0.048957378 +-2.3987331 fs generations -0.048957378 +-2.3729722 The damage -0.048957378 +-0.99501497 consequently damage -0.048957378 +-2.880715 be irreversible -0.048957378 +-1.647248 bad qualified -0.048957378 +-0.995135 productivity causing -0.048957378 +-1.1710447 entire nation -0.048957378 +-2.174259 Even -0.27406517 +-1.9456341 no direct -0.048957378 +-1.3923441 result direct -0.048957378 +-1.6474046 possible wastes -0.048957378 +-3.4204583 the production -0.048957378 +-2.219547 reasons above -0.048957378 +-2.8665538 the above -0.13618872 +-0.69410884 stated above -0.048957378 +-3.4000094 the institutions -0.048957378 +-2.454793 all institutions -0.048957378 +-2.052458 first priority -0.048957378 +-3.0230536 their agendas -0.048957378 +-0.995135 No excuses -0.048957378 +-2.1054032 any delay -0.048957378 +-3.28005 of action -0.048957378 +-2.880715 be accepted -0.048957378 +-2.928319 that right -0.048957378 +-3.1356838 a right -0.048957378 +-2.7126353 job right -0.048957378 +-2.0256789 the right -0.0796529 +-3.0886452 and right -0.048957378 +-2.9101167 their right -0.048957378 +-2.228456 education right -0.048957378 +-1.1684108 truly right -0.048957378 +-2.880715 be assured -0.048957378 +-1.1710447 perfect self-improvement -0.048957378 +-3.3047535 a worthwhile -0.048957378 +-1.8956895 It serves -0.048957378 +-1.797456 important element -0.048957378 +-3.4204583 the preparatory -0.048957378 +-3.2358162 a stage -0.048957378 +-2.3708131 important stage -0.048957378 +-0.69396824 preparatory stage -0.048957378 +-0.69396824 curious stage 
-0.048957378 +-2.5885956 from country -0.048957378 +-2.2866669 one country -0.048957378 +-0.69410884 conscious country -0.048957378 +-3.1293185 in Asia -0.048957378 +-0.6942867 Asia whereby -0.048957378 +-1.5383809 feel obliged -0.048957378 +-1.647248 financially self-supporting -0.048957378 +-3.3677967 , save -0.048957378 +-3.1451926 and save -0.048957378 +-2.5737934 to save -0.13618872 +-2.8121924 they save -0.048957378 +-1.6417291 families save -0.048957378 +-1.5913469 both save -0.048957378 +-3.2979217 and augment -0.048957378 +-3.2908964 a whole -0.048957378 +-2.1194625 This whole -0.048957378 +-2.543778 money among -0.048957378 +-1.645653 independence among -0.048957378 +-0.69410884 resistance among -0.048957378 +-2.878185 the youth -0.048957378 +-2.3729722 The youth -0.048957378 +-2.569465 On -0.20946135 +-3.0728445 in his\/her -0.048957378 +-3.1933482 of his\/her -0.048957378 +-2.0428584 make his\/her -0.048957378 +-1.9155744 support his\/her -0.048957378 +-1.4696376 reduce his\/her -0.048957378 +-2.7708824 it promotes -0.048957378 +-3.2979217 and duties -0.048957378 +-0.6942867 duties assigned -0.048957378 +-3.4000094 the virtues -0.048957378 +-2.3779159 important virtues -0.048957378 +-2.7396104 as hard-work -0.048957378 +-2.6395223 , teamwork -0.27955192 +-2.709418 with respect -0.048957378 +-3.2102208 and respect -0.048957378 +-3.3813157 to respect -0.048957378 +-0.69396824 much-needed respect -0.048957378 +-3.0452757 for authority -0.048957378 +-2.569465 Although -0.048957378 +-3.2979217 and authorities -0.048957378 +-3.6294703 schools -0.048957378 +-3.014355 their schools -0.048957378 +-3.1828704 a local -0.048957378 +-2.8272688 the local -0.048957378 +-3.1451926 and local -0.048957378 +-2.9430885 their local -0.048957378 +-2.7028904 as local -0.048957378 +-2.5105898 at local -0.048957378 +-1.5382957 local governments -0.048957378 +-2.880715 be proactive -0.048957378 +-3.2799814 and setting -0.048957378 +-2.3317432 when setting -0.048957378 +-2.5599043 the appropriate -0.13618872 +-2.4667776 an appropriate -0.048957378 +-2.4409037 this policy -0.048957378 +-3.2358162 a policy -0.048957378 +-1.1700908 appropriate policy -0.048957378 +-2.1457708 smoking policy -0.048957378 +-1.3925304 policy measures -0.048957378 +-3.4931862 , avoid -0.048957378 +-3.4479775 to avoid -0.048957378 +-2.2803774 some pitfalls -0.048957378 +-2.9492288 , including -0.048957378 +-1.5946901 expenses including -0.048957378 +-0.69410884 pitfalls including -0.048957378 +-1.2958019 including exploitation -0.048957378 +-3.2979217 and abuse -0.048957378 +-3.5096595 , excessive -0.048957378 +-3.2979217 and absenteeism -0.048957378 +-3.2979217 and gross -0.048957378 +-0.6942867 gross negligence -0.048957378 +-3.014355 their schooling -0.048957378 +-1.9687632 our schooling -0.048957378 +-1.6467657 main purposes -0.048957378 +-0.6942267 relaxation purposes -0.048957378 +-1.8381437 : i -0.048957378 +-2.1533551 -RRB- attain -0.048957378 +-1.8383849 right competency -0.048957378 +-3.2799814 and attitude -0.048957378 +-3.014355 their attitude -0.048957378 +-2.873293 be desired -0.048957378 +-2.4549398 their desired -0.048957378 +-1.1710447 desired expertise -0.048957378 +-3.5096595 , ii -0.048957378 +-2.152201 -RRB- finish -0.048957378 +-2.8497334 they finish -0.048957378 +-2.7320588 not lose -0.048957378 +-2.8836398 to lose -0.048957378 +-2.3815436 may lose -0.048957378 +-1.5359426 even lose -0.048957378 +-1.4687891 eventually lose -0.048957378 +-1.5378767 lose sight -0.048957378 +-1.5957409 complete sight 
-0.048957378 +-1.5961065 potential gains -0.048957378 +-3.3624787 the four -0.048957378 +-2.9977748 their four -0.048957378 +-2.2278552 these four -0.048957378 +-1.17113 four corners -0.048957378 +-2.8842342 the classroom -0.048957378 +-3.0907614 Thus -0.187397 +-0.6942267 h. Thus -0.048957378 +-3.1586366 a balance -0.048957378 +-3.2177634 the balance -0.048957378 +-2.377721 to balance -0.048957378 +-2.0400174 must balance -0.048957378 +-2.1349201 social balance -0.048957378 +-2.3897834 life balance -0.048957378 +-1.8294846 right balance -0.048957378 +-2.7713203 not necessarily -0.048957378 +-2.3748312 many points -0.048957378 +-1.5961901 three points -0.048957378 +-3.4000094 the argument -0.048957378 +-1.5381981 second argument -0.048957378 +-1.2958871 end unless -0.048957378 +-2.8842342 the decision -0.048957378 +-3.4204583 the attainment -0.048957378 +-3.2676826 of organizational -0.048957378 +-2.3987331 fs organizational -0.048957378 +-3.2799814 and socially -0.048957378 +-0.99501497 communicate socially -0.048957378 +-3.321535 the current -0.048957378 +-3.3813157 to current -0.048957378 +-2.9787707 their current -0.048957378 +-2.3608074 The current -0.048957378 +-2.3961296 may consist -0.048957378 +-3.120598 is mainly -0.048957378 +-0.6942267 consist mainly -0.048957378 +-3.102048 is fast -0.048957378 +-3.244391 of fast -0.048957378 +-1.1706803 moving fast -0.048957378 +-2.7186737 or manual -0.048957378 +-2.709418 with invaluable -0.048957378 +-2.326847 are invaluable -0.048957378 +-1.9419398 provide invaluable -0.048957378 +-1.1700908 gaining invaluable -0.048957378 +-2.401619 fs targeted -0.048957378 +-2.7708678 Balancing -0.04895735 +-2.7857137 work schedules -0.048957378 +-2.7024844 or schedules -0.048957378 +-1.8370627 class schedules -0.048957378 +-3.2979217 and arranging -0.048957378 +-3.2799814 and effective -0.048957378 +-1.9459537 provide effective -0.048957378 +-2.672149 can assist -0.048957378 +-2.1393735 will assist -0.187397 +-3.1451926 and assist -0.048957378 +-3.3187766 to assist -0.048957378 +-2.484846 jobs assist -0.048957378 +-2.5385072 should assist -0.048957378 +-3.0728445 in becoming -0.048957378 +-3.1764908 and becoming -0.048957378 +-3.1933482 of becoming -0.048957378 +-3.3489213 to becoming -0.048957378 +-1.1696702 fast becoming -0.048957378 +-3.321535 the daily -0.048957378 +-2.728165 and daily -0.187397 +-2.9787707 their daily -0.048957378 +-1.7681487 enjoy daily -0.048957378 +-1.5961065 both presently -0.048957378 +-2.9150915 to prepare -0.13618872 +-2.2324088 help prepare -0.048957378 +-3.0382125 for car -0.048957378 +-3.2908964 a car -0.048957378 +-1.6926183 start post -0.048957378 +-0.99501497 car post -0.048957378 +-1.733112 graduation careers -0.048957378 +-2.4503036 their careers -0.048957378 +-1.645653 professional careers -0.048957378 +-2.880715 be passed -0.048957378 +-2.873293 be extremely -0.048957378 +-2.0867398 become extremely -0.048957378 +-3.644728 Potential -0.048957378 +-2.880715 be impressed -0.048957378 +-2.532656 at interviews -0.048957378 +-3.4479775 to cities -0.048957378 +-0.6942267 multicultural cities -0.048957378 +-1.1710447 lower socio-economic -0.048957378 +-0.6942867 socio-economic backgrounds -0.048957378 +-3.4650414 to partly -0.048957378 +-2.6641047 can complete -0.048957378 +-3.1586366 a complete -0.048957378 +-3.2177634 the complete -0.048957378 +-2.90795 , complete -0.048957378 +-3.2905889 to complete -0.048957378 +-1.5331545 actually complete -0.048957378 +-1.5331545 lose complete -0.048957378 +-2.0139093 society 
professionally -0.048957378 +-2.469975 an educated -0.048957378 +-2.859086 be contributing -0.048957378 +-3.4625692 , contributing -0.048957378 +-3.246793 and contributing -0.048957378 +-2.0330791 better prepared -0.048957378 +-2.8842342 the challenges -0.048957378 +-2.7567537 college ends -0.048957378 +-2.672149 can develop -0.048957378 +-2.8730767 to develop -0.04895735 +-2.933356 students develop -0.048957378 +-2.392312 also develop -0.048957378 +-2.8121924 they develop -0.048957378 +-2.6663196 or develop -0.048957378 +-2.2898738 one stronger -0.048957378 +-1.5957409 develop stronger -0.048957378 +-2.2445176 When -0.1765346 +-3.5096595 , appropriately -0.048957378 +-0.6942867 appropriately dressed -0.048957378 +-1.8062247 real consequences -0.048957378 +-1.5382957 her actions -0.048957378 +-3.644728 She -0.048957378 +-2.7044797 can wait -0.048957378 +-3.4479775 to wait -0.048957378 +-2.0523734 make naive -0.048957378 +-0.6942867 naive mistakes -0.048957378 +-2.4476333 this critical -0.048957378 +-3.102048 is critical -0.048957378 +-3.244391 of critical -0.048957378 +-3.1293185 in favor -0.048957378 +-2.4247963 a variety -0.27955192 +-1.1708648 wider variety -0.048957378 +-3.014355 their classmates -0.048957378 +-1.1708648 again classmates -0.048957378 +-2.1669977 but maybe -0.048957378 +-2.7852879 time together -0.048957378 +-2.7663805 work together -0.048957378 +-0.9942178 communicating together -0.048957378 +-0.6938276 Living together -0.048957378 +-0.6938276 gather together -0.048957378 +-3.2799814 and thinking -0.048957378 +-3.2676826 of thinking -0.048957378 +-2.1214447 much closer -0.048957378 +-2.2804153 get along -0.048957378 +-1.8225902 my belief -0.187397 +-3.4479775 to recognize -0.048957378 +-2.5596967 should recognize -0.048957378 +-2.7187858 with lectures -0.048957378 +-3.106568 in lectures -0.048957378 +-3.244391 of lectures -0.048957378 +-3.4931862 , meetings -0.048957378 +-1.295562 hour meetings -0.048957378 +-3.09512 Otherwise -0.048957378 +-2.5982223 from poorer -0.048957378 +-3.3047535 a disadvantage -0.048957378 +-0.995135 disadvantage compared -0.048957378 +-2.5949488 from wealthy -0.048957378 +-2.8983755 are wealthy -0.048957378 +-2.6886988 can pass -0.048957378 +-3.428628 , pass -0.048957378 +-3.2102208 and pass -0.048957378 +-3.3813157 to pass -0.048957378 +-3.3624787 the ever -0.048957378 +-2.178507 has ever -0.048957378 +-1.2834088 without ever -0.048957378 +-3.4931862 , experiencing -0.048957378 +-1.295562 ever experiencing -0.048957378 +-3.106568 in regular -0.048957378 +-2.7283971 as regular -0.048957378 +-0.9947796 experiencing regular -0.048957378 +-2.7972748 have empathy -0.048957378 +-2.5593464 in modern -0.187397 +-2.3729722 The modern -0.048957378 +-3.1293185 in politics -0.048957378 +-3.2979217 and justice -0.048957378 +-3.644728 Universities -0.048957378 +-2.181358 could award -0.048957378 +-0.6942867 award credits -0.048957378 +-2.387195 do volunteer -0.048957378 +-3.4204583 the all-round -0.048957378 +-2.562118 should count -0.048957378 +-1.4698606 especially towards -0.048957378 +-1.294531 particularly towards -0.048957378 +-0.9944986 attitude towards -0.048957378 +-0.69396824 count towards -0.048957378 +-3.0230536 their degrees -0.048957378 +-3.644728 School -0.048957378 +-3.102048 is expensive -0.048957378 +-2.1624105 very expensive -0.048957378 +-2.7283971 as expensive -0.048957378 +-2.5672007 is costly -0.048957378 +-2.5119796 jobs assisting -0.048957378 +-1.7693542 For busy -0.048957378 +-2.8837602 are busy -0.048957378 +-0.69410884 
assisting busy -0.048957378 +-1.1710447 busy professionals -0.048957378 +-1.969951 other sources -0.048957378 +-2.7368789 I doubt -0.048957378 +-1.9456341 no doubt -0.048957378 +-2.1511729 I spent -0.048957378 +-2.7969828 be spent -0.048957378 +-2.709646 not spent -0.048957378 +-2.2275927 time spent -0.048957378 +-2.7196925 college spent -0.048957378 +-2.0178785 better spent -0.048957378 +-1.9363589 years spent -0.048957378 +-3.2979217 and figuring -0.048957378 +-2.859086 be personally -0.048957378 +-3.102048 is personally -0.048957378 +-1.5946901 both personally -0.048957378 +-3.4204583 the usual -0.048957378 +-2.0431418 valuable form -0.048957378 +-3.284121 the form -0.048957378 +-2.267613 some form -0.048957378 +-1.961792 other form -0.048957378 +-0.6938276 feedback form -0.048957378 +-3.0178597 that profession -0.048957378 +-2.873293 be nice -0.048957378 +-3.2908964 a nice -0.048957378 +-3.4650414 to speed -0.048957378 +-2.2804153 get certified -0.048957378 +-3.2799814 and receive -0.048957378 +-2.9150915 to receive -0.048957378 +-1.6466527 degree faster -0.048957378 +-1.2956754 expectations faster -0.048957378 +-3.4650414 to raise -0.048957378 +-2.1238894 their children -0.048957378 +-2.3608074 The children -0.048957378 +-2.0457513 his children -0.048957378 +-0.69396824 longer children -0.048957378 +-2.5790641 to choose -0.04895735 +-2.821667 they choose -0.048957378 +-2.2780018 who choose -0.048957378 +-2.328002 would choose -0.048957378 +-1.766022 always choose -0.048957378 +-3.4650414 to interact -0.048957378 +-3.4625692 , community -0.048957378 +-2.7024844 or community -0.048957378 +-1.5370556 local community -0.048957378 +-1.3925304 various epeople-skills -0.048957378 +-1.82423 I had -0.048957378 +-2.668018 not had -0.048957378 +-3.0156138 and had -0.048957378 +-2.378885 students had -0.048957378 +-2.1354082 but had -0.048957378 +-2.693505 college had -0.048957378 +-2.7056131 have had -0.048957378 +-2.7676923 they had -0.048957378 +-2.254748 who had -0.048957378 +-1.1671549 myself had -0.048957378 +-1.5858314 someone had -0.048957378 +-1.8678414 was 14 -0.048957378 +-2.410627 also old -0.048957378 +-2.8837602 are old -0.048957378 +-1.944475 years old -0.048957378 +-1.5382957 local pharmacy -0.048957378 +-2.0522041 than capable -0.048957378 +-2.4667776 an appreciation -0.048957378 +-0.6942267 Early appreciation -0.048957378 +-3.0607083 is required -0.048957378 +-1.4696376 level required -0.048957378 +-2.8507237 are required -0.048957378 +-1.4687891 effort required -0.048957378 +-0.9942178 absolutely required -0.048957378 +-2.4199624 by interacting -0.048957378 +-2.85306 the public -0.048957378 +-3.089379 in public -0.048957378 +-3.2102208 and public -0.048957378 +-2.4438105 all public -0.048957378 +-3.4650414 to juggle -0.048957378 +-2.1669266 very surprised -0.048957378 +-2.052458 first came -0.048957378 +-2.7044797 can bring -0.048957378 +-2.8497334 they bring -0.048957378 +-2.4184146 more maturity -0.048957378 +-3.2649243 a currently -0.048957378 +-2.8837602 are currently -0.048957378 +-1.865671 Japan currently -0.048957378 +-3.0808864 is purely -0.048957378 +-1.6444623 just purely -0.048957378 +-0.69396824 devoted purely -0.048957378 +-0.69396824 facilities purely -0.048957378 +-3.644728 Hence -0.048957378 +-3.6294703 1 -0.048957378 +-2.8983755 are 1 -0.048957378 +-2.3772526 many interesting -0.048957378 +-1.9466504 no replacement -0.048957378 +-3.601002 2 -0.048957378 +-3.4625692 , 2 -0.048957378 +-2.7835681 have 2 -0.048957378 +-2.9793792 for everything -0.048957378 
+-3.3187766 to everything -0.048957378 +-2.090273 then everything -0.048957378 +-2.7526727 have everything -0.048957378 +-2.0895708 If everything -0.048957378 +-1.16925 almost everything -0.048957378 +-3.0808864 is given -0.048957378 +-2.8669279 are given -0.048957378 +-2.09662 If given -0.048957378 +-1.5935729 been given -0.048957378 +-3.601002 3 -0.048957378 +-3.246793 and 3 -0.048957378 +-0.9947796 next 3 -0.048957378 +-3.0178597 that relates -0.048957378 +-3.644728 Sometimes -0.048957378 +-3.3047535 a salary -0.048957378 +-2.7516315 a resume -0.04895735 +-3.6294703 4 -0.048957378 +-1.295562 short 4 -0.048957378 +-3.2979217 and mentors -0.048957378 +-3.644728 Thank -0.048957378 +-2.7267487 will explain -0.048957378 +-2.7708678 Reason -0.04895735 +-2.920025 to survive -0.048957378 +-2.569465 With -0.048957378 +-3.3047535 a struggle -0.048957378 +-3.4204583 the continual -0.048957378 +-2.3748312 many excellent -0.048957378 +-2.4667776 an excellent -0.048957378 +-2.8272688 the long -0.048957378 +-2.7526727 have long -0.048957378 +-2.8351026 are long -0.048957378 +-2.3940783 life long -0.048957378 +-1.4689857 impact long -0.048957378 +-0.69368714 Before long -0.048957378 +-1.5382957 long length -0.048957378 +-3.2676826 of vacation -0.048957378 +-1.5378767 long vacation -0.048957378 +-3.644728 Which -0.048957378 +-2.873293 be greatly -0.048957378 +-2.4158804 also greatly -0.048957378 +-2.0510054 often wasted -0.048957378 +-0.99501497 greatly wasted -0.048957378 +-2.1669266 very lazy -0.048957378 +-3.2799814 and unproductive -0.048957378 +-3.4479775 to unproductive -0.048957378 +-3.644728 Or -0.048957378 +-1.4711709 late nights -0.048957378 +-1.4711709 few nights -0.048957378 +-3.0452757 for travel -0.048957378 +-2.1071095 young persons -0.048957378 +-2.380017 fs growth -0.048957378 +-1.766022 personal growth -0.048957378 +-1.1696702 mental growth -0.048957378 +-0.6938276 persons growth -0.048957378 +-0.9942178 sexual growth -0.048957378 +-2.0522888 often multicultural -0.048957378 +-3.246793 and off -0.048957378 +-2.0111368 well off -0.048957378 +-1.1706803 paid off -0.048957378 +-1.647248 statement eIt -0.048957378 +-3.2979217 and employability -0.048957378 +-1.6470779 same field\/industry -0.048957378 +-2.8842342 the ereal -0.048957378 +-2.7713203 not suited -0.048957378 +-2.7181385 I look -0.048957378 +-2.5843997 to look -0.13618872 +-2.5494218 should look -0.048957378 +-1.6897244 employers look -0.048957378 +-3.4000094 the path -0.048957378 +-1.9218646 career path -0.048957378 +-3.0230536 their homework\/assignments -0.048957378 +-2.569465 At -0.11268411 +-2.0524554 valuable input -0.048957378 +-3.2102208 and using -0.048957378 +-3.21812 of using -0.048957378 +-2.7197275 as using -0.048957378 +-2.8669279 are using -0.048957378 +-3.014355 their initiative -0.048957378 +-1.9455216 own initiative -0.048957378 +-1.5382957 problem solving -0.048957378 +-3.5096595 , communication -0.048957378 +-2.5599043 the customer -0.13618872 +-3.4931862 , customer -0.048957378 +-3.0230536 their uniforms -0.048957378 +-2.4324028 Being -0.099168696 +-2.5019197 part-time brings -0.048957378 +-3.4931862 , brings -0.048957378 +-2.880715 be pointed -0.048957378 +-1.9468912 provide real-world -0.048957378 +-2.708228 can complement -0.048957378 +-2.569465 Also -0.20946135 +-3.1880362 the spending -0.048957378 +-2.698859 not spending -0.048957378 +-3.0886452 and spending -0.048957378 +-1.8307188 extra spending -0.048957378 +-2.0729342 out spending -0.048957378 +-1.9314854 own spending -0.048957378 +-1.292295 
towards spending -0.048957378 +-0.6934062 unreasonable spending -0.048957378 +-2.5531027 is certainly -0.048957378 +-2.7047877 will certainly -0.048957378 +-3.428628 , certainly -0.048957378 +-1.6444623 degree certainly -0.048957378 +-2.859086 be welcome -0.048957378 +-3.2649243 a welcome -0.048957378 +-1.7693542 always welcome -0.048957378 +-3.2085369 a choice -0.048957378 +-2.109144 This choice -0.048957378 +-1.766022 personal choice -0.048957378 +-1.5924584 best choice -0.048957378 +-0.9942178 wise choice -0.048957378 +-2.208895 which complements -0.048957378 +-3.5096595 , looking -0.048957378 +-2.7663805 work within -0.048957378 +-2.8507237 are within -0.048957378 +-1.2942703 burden within -0.048957378 +-1.3905429 together within -0.048957378 +-2.1427004 smoking within -0.048957378 +-3.0382125 for lab -0.048957378 +-3.2908964 a lab -0.048957378 +-3.2649243 a research -0.048957378 +-3.244391 of research -0.048957378 +-1.8357562 A research -0.048957378 +-1.1710447 research assistant -0.048957378 +-3.2908964 a professor -0.048957378 +-3.014355 their professor -0.048957378 +-3.3047535 a well-paying -0.048957378 +-3.2979217 and conveniently -0.048957378 +-0.6942867 conveniently located -0.048957378 +-3.4204583 the content -0.048957378 +-2.880715 be desirable -0.048957378 +-2.859086 be interested -0.048957378 +-2.9982424 that interested -0.048957378 +-2.7556913 not interested -0.048957378 +-2.104994 -LRB- although -0.048957378 +-2.4562922 this scenario -0.048957378 +-0.6942867 scenario leads -0.048957378 +-1.1710447 certain existentially -0.048957378 +-0.6942867 existentially themed -0.048957378 +-0.6942867 themed questions -0.048957378 +-3.644728 Regardless -0.048957378 +-2.7665153 it profit -0.048957378 +-1.7332089 still profit -0.048957378 +-3.2979217 and prioritize -0.048957378 +-3.120598 is merit -0.048957378 +-2.2786872 some merit -0.048957378 +-2.4266567 a significant -0.04895735 +-1.837133 high expense -0.048957378 +-1.1705118 significant expense -0.048957378 +-1.5946901 large expense -0.048957378 +-3.4204583 the capital -0.048957378 +-2.9583905 , thereby -0.048957378 +-2.749941 and thereby -0.048957378 +-3.2799814 and cover -0.048957378 +-2.5943775 to cover -0.04895735 +-3.3047535 a bank -0.048957378 +-2.6392057 student loan -0.048957378 +-3.246793 and loan -0.048957378 +-0.69410884 bank loan -0.048957378 +-2.121458 even mortgaging -0.048957378 +-3.0230536 their house -0.048957378 +-3.246793 and puts -0.048957378 +-2.1163917 This puts -0.048957378 +-0.9947796 institutes puts -0.048957378 +-3.28005 of strain -0.048957378 +-2.7309246 with financing -0.048957378 +-3.428628 , materials -0.048957378 +-3.2102208 and materials -0.048957378 +-2.349459 study materials -0.048957378 +-1.469627 educational materials -0.048957378 +-2.9631343 , reducing -0.187397 +-2.5628006 the pressure -0.13618872 +-3.2979217 and encouraging -0.048957378 +-2.4168756 a large -0.099168696 +-2.1450179 most large -0.048957378 +-1.9487343 at large -0.048957378 +-0.9944986 relatively large -0.048957378 +-1.5961065 large percentage -0.187397 +-3.28005 of shoppers -0.048957378 +-1.9701205 take notice -0.048957378 +-3.2676826 of changes -0.048957378 +-1.8669469 environment changes -0.048957378 +-3.1293185 in fashion -0.048957378 +-3.2979217 and technology -0.048957378 +-2.6972163 can buy -0.048957378 +-3.4163227 to buy -0.048957378 +-2.2396004 only buy -0.048957378 +-2.8842342 the latest -0.048957378 +-0.995135 latest products -0.048957378 +-1.2958019 thereby reviving -0.048957378 +-1.5382957 local shops -0.048957378 
+-2.859086 be low -0.048957378 +-3.2649243 a low -0.048957378 +-3.102048 is low -0.048957378 +-2.2909184 so shop -0.048957378 +-0.6942867 shop owners -0.048957378 +-0.99501497 savings onto -0.048957378 +-1.8691914 going onto -0.048957378 +-3.2391717 to me -0.048957378 +-1.6391098 benefit me -0.048957378 +-1.3874582 allow me -0.048957378 +-1.5880291 made me -0.048957378 +-1.4654533 around me -0.048957378 +-1.1685529 puts me -0.048957378 +-0.6932658 enabled me -0.048957378 +-0.6932658 provided me -0.048957378 +-0.6932658 gave me -0.048957378 +-2.1937845 it seems -0.048957378 +-1.8947358 It seems -0.048957378 +-2.722524 will carry -0.048957378 +-3.2799814 and carry -0.048957378 +-1.3925304 various task -0.048957378 +-3.2358162 a goal -0.048957378 +-2.3608074 The goal -0.048957378 +-0.40941563 primary goal -0.048957378 +-0.69396824 original goal -0.048957378 +-2.859086 be rewarding -0.048957378 +-2.1624105 very rewarding -0.048957378 +-1.5370556 quite rewarding -0.048957378 +-3.4650414 to mention -0.048957378 +-3.644728 Basic -0.048957378 +-3.644728 Looking -0.048957378 +-3.2799814 and seeing -0.048957378 +-2.732904 on seeing -0.048957378 +-3.4650414 to payback -0.048957378 +-3.2102208 and over -0.048957378 +-1.8906534 little over -0.048957378 +-0.9944986 preferred over -0.048957378 +-0.9944986 carry over -0.048957378 +-2.1220925 there everyday -0.048957378 +-2.1050158 into everyday -0.048957378 +-1.3925304 show irresponsible -0.048957378 +-1.4715303 eventually breakdown -0.048957378 +-2.4476333 this economic -0.048957378 +-3.3624787 the economic -0.048957378 +-3.244391 of economic -0.048957378 +-3.1303647 is happing -0.048957378 +-3.644728 Using -0.048957378 +-3.3047535 a false -0.048957378 +-1.4721353 credit card -0.048957378 +-2.9060166 are constantly -0.048957378 +-0.6942867 constantly pushing -0.048957378 +-1.9691006 take care -0.187397 +-1.2956754 taken care -0.048957378 +-3.4204583 the bill -0.048957378 +-2.842708 be banned -0.21607319 +-3.0808864 is banned -0.048957378 +-1.7308546 were banned -0.048957378 +-1.469627 already banned -0.048957378 +-2.7312138 a number -0.187397 +-3.321535 the number -0.048957378 +-1.5935729 large number -0.048957378 +-0.9944986 total number -0.187397 +-1.7087959 who goes -0.187397 +-1.8669469 environment goes -0.048957378 +-2.1655056 you didn -0.048957378 +-1.3922309 University didn -0.048957378 +-3.3047535 a dime -0.048957378 +-1.3925304 University tuitions -0.048957378 +-3.2908964 a summer -0.048957378 +-3.014355 their summer -0.048957378 +-0.995135 summer vacations -0.048957378 +-3.4204583 the summers -0.048957378 +-2.7708678 Furthermore -0.27955192 +-2.349459 study schedule -0.048957378 +-2.2162788 work schedule -0.187397 +-1.8024169 real schedule -0.048957378 +-1.8356878 class schedule -0.048957378 +-2.1656544 I got -0.048957378 +-1.2240654 after graduating -0.04895735 +-3.102048 is already -0.048957378 +-1.6885092 have already -0.048957378 +-0.9947796 fd already -0.048957378 +-1.4715303 already experienced -0.048957378 +-2.8842342 the requirements -0.187397 +-2.5483203 money c -0.048957378 +-2.0129478 well c -0.048957378 +-2.873293 be seen -0.048957378 +-0.6942267 plainly seen -0.048957378 +-3.0090466 for extra-curricular -0.048957378 +-3.428628 , extra-curricular -0.048957378 +-3.21812 of extra-curricular -0.048957378 +-1.9642754 other extra-curricular -0.048957378 +-1.7342885 before hiring -0.048957378 +-3.3047535 a bubble -0.048957378 +-1.6920769 upon leaving -0.048957378 +-3.4625692 , leaving -0.048957378 +-2.1369052 after leaving -0.048957378 
+-3.2908964 a head -0.048957378 +-2.3987331 fs head -0.048957378 +-3.089379 in creating -0.048957378 +-3.2102208 and creating -0.048957378 +-2.1138942 even creating -0.048957378 +-2.1003318 into creating -0.048957378 +-3.4204583 the failures -0.048957378 +-2.182114 has realistic -0.048957378 +-2.8176763 time adjusting -0.048957378 +-2.7983925 work \/ -0.048957378 +-3.284121 the employees -0.048957378 +-1.8010415 restaurant employees -0.048957378 +-2.762729 have employees -0.048957378 +-1.6881027 These employees -0.048957378 +-1.2948219 fellow employees -0.048957378 +-3.0178597 that wont -0.048957378 +-0.6942867 wont burn -0.048957378 +-3.4650414 to gauge -0.048957378 +-2.4199624 by assessing -0.048957378 +-3.4650414 to multi-task -0.048957378 +-3.5096595 , grade -0.048957378 +-1.5962766 point averages -0.048957378 +-2.7972748 have devised -0.048957378 +-3.28005 of formula -0.048957378 +-1.1710447 employer feels -0.048957378 +-3.2979217 and excels -0.048957378 +-2.469975 an assignment -0.048957378 +-2.1737337 Finally -0.70552063 +-1.295562 h Finally -0.048957378 +-3.3624787 the general -0.048957378 +-3.106568 in general -0.048957378 +-3.246793 and general -0.048957378 +-3.644728 Idle -0.048957378 +-0.6942867 Idle minds -0.048957378 +-2.6507893 student deserves -0.048957378 +-3.0452757 for needy -0.048957378 +-2.2911956 who face -0.048957378 +-3.3624787 the difficulties -0.048957378 +-3.244391 of difficulties -0.048957378 +-2.0098362 financial difficulties -0.048957378 +-3.644728 Incomes -0.048957378 +-1.3925304 basic necessities -0.048957378 +-3.4931862 , purchase -0.048957378 +-2.9767401 students purchase -0.048957378 +-3.4931862 , dedication -0.048957378 +-3.2799814 and dedication -0.048957378 +-1.2958019 towards unreasonable -0.048957378 +-3.4204583 the inherent -0.048957378 +-1.6926544 These structures -0.048957378 +-3.0178597 that exist -0.048957378 +-3.644728 Early -0.048957378 +-1.6926183 start formal -0.048957378 +-1.295562 using formal -0.048957378 +-1.2958019 thereby adjust -0.048957378 +-3.4479775 to demand -0.048957378 +-2.5091865 jobs demand -0.048957378 +-0.995135 demand involvement -0.048957378 +-3.644728 Both -0.048957378 +-2.4003265 my private -0.048957378 +-0.6942267 Both private -0.048957378 +-3.2676826 of utmost -0.048957378 +-3.014355 their utmost -0.048957378 +-2.031715 In New -0.048957378 +-3.1215048 in New -0.048957378 +-0.40954217 New Zealand -0.187397 +-3.0178597 that enabled -0.048957378 +-2.7972748 have continued -0.048957378 +-3.0808864 is tertiary -0.048957378 +-2.4081247 by tertiary -0.048957378 +-1.8191105 my tertiary -0.048957378 +-0.9944986 Attending tertiary -0.048957378 +-2.4028242 my monthly -0.048957378 +-3.0452757 for housing -0.048957378 +-3.0452757 for teenagers -0.048957378 +-3.4650414 to move -0.048957378 +-3.4650414 to spread -0.048957378 +-3.0230536 their wings -0.048957378 +-3.644728 Ultimately -0.048957378 +-2.1621325 I wanted -0.048957378 +-2.162571 you wanted -0.048957378 +-1.8930372 really wanted -0.048957378 +-3.4650414 to prove -0.048957378 +-3.28005 of yourself -0.048957378 +-3.644728 Naturally -0.048957378 +-2.5259967 at entertainment -0.048957378 +-1.5370556 simply entertainment -0.048957378 +-1.2950919 including entertainment -0.048957378 +-1.2958019 government sponsored -0.048957378 +-2.208895 which covered -0.048957378 +-2.1230183 there wasn -0.048957378 +-2.3099248 be left -0.048957378 +-2.5483203 money left -0.048957378 +-3.644728 During -0.048957378 +-3.3047535 a gas -0.048957378 +-0.6942867 gas station -0.048957378 +-0.6942267 
station attendant -0.048957378 +-0.6942267 parking attendant -0.048957378 +-3.5096595 , kitchen -0.048957378 +-3.4931862 , video -0.048957378 +-3.2676826 of video -0.048957378 +-0.995135 video rental -0.048957378 +-0.6942867 rental clerk -0.048957378 +-3.5096595 , parking -0.048957378 +-2.7181385 I finally -0.048957378 +-3.428628 , finally -0.048957378 +-3.2102208 and finally -0.048957378 +-1.7685759 he finally -0.048957378 +-0.995135 hotel porter -0.048957378 +-2.5119796 jobs provided -0.048957378 +-2.0330791 better communicative -0.048957378 +-3.4650414 to organize -0.048957378 +-2.4184146 more efficiently -0.048957378 +-2.6392057 student opinions -0.048957378 +-2.2278552 these opinions -0.048957378 +-1.5955248 three opinions -0.187397 +-1.8057122 gain hands -0.048957378 +-1.5379899 obtain hands -0.048957378 +-3.2908964 a textbook -0.048957378 +-3.4000094 the textbook -0.048957378 +-2.6507893 student selected -0.048957378 +-2.3602743 study programs -0.048957378 +-3.5096595 , seeking -0.048957378 +-2.9060166 are unsure -0.048957378 +-3.2979217 and undecided -0.048957378 +-2.8842342 the direction -0.187397 +-2.4184146 more options -0.048957378 +-3.3047535 a brighter -0.048957378 +-3.09512 Third -0.187397 +-2.181358 could inspire -0.048957378 +-2.920025 to excel -0.048957378 +-3.0230536 their senior -0.048957378 +-3.644728 Work -0.048957378 +-3.4650414 to comprehend -0.048957378 +-3.09512 Based -0.187397 +-3.1293185 in discovering -0.048957378 +-3.3047535 a lifetime -0.048957378 +-2.9060166 are helping -0.048957378 +-3.4650414 to unmotivated -0.048957378 +-1.9466504 no sadder -0.048957378 +-0.6942867 sadder story -0.048957378 +-3.102048 is finished -0.048957378 +-2.178507 has finished -0.048957378 +-2.7835681 have finished -0.048957378 +-1.1710447 finished 16 -0.048957378 +-3.4650414 to run -0.048957378 +-2.7044797 can challenge -0.048957378 +-1.8055046 real challenge -0.048957378 +-2.7972748 have tried -0.048957378 +-2.42757 , dealing -0.40449065 +-2.4343994 experience dealing -0.048957378 +-2.7112274 as dealing -0.048957378 +-1.4690876 especially dealing -0.048957378 +-1.3902439 though dealing -0.048957378 +-3.3047535 a boss -0.048957378 +-2.569465 Those -0.048957378 +-3.09512 Parents -0.048957378 +-3.09512 Doing -0.048957378 +-2.418582 also shows -0.048957378 +-1.9466504 no longer -0.048957378 +-1.692859 upon ourselves -0.048957378 +-1.1708648 ask ourselves -0.048957378 +-2.3119407 be taught -0.04895735 +-3.3047535 a special -0.048957378 +-3.2979217 and colleagues -0.048957378 +-2.596728 to grow -0.13618872 +-2.1375635 up quicker -0.048957378 +-2.0331504 about glife -0.048957378 +-2.9977748 their kids -0.048957378 +-1.3916435 Most kids -0.048957378 +-1.94348 own kids -0.048957378 +-3.4204583 the correct -0.048957378 +-3.644728 Throughout -0.048957378 +-3.2979217 and grandparents -0.048957378 +-1.7715586 Many youngsters -0.048957378 +-2.401619 fs consumerist -0.048957378 +-1.9465656 own cell -0.048957378 +-0.6942867 cell phones -0.048957378 +-0.6942867 phones funded -0.048957378 +-3.1293185 in possession -0.048957378 +-0.995135 latest games -0.048957378 +-3.2799814 and clothes -0.048957378 +-0.4095183 fancy clothes -0.048957378 +-3.2908964 a wage -0.048957378 +-0.99501497 minimum wage -0.048957378 +-1.4712842 example communicating -0.048957378 +-2.8133025 time communicating -0.048957378 +-3.321535 the groups -0.048957378 +-2.349459 study groups -0.048957378 +-2.144826 social groups -0.048957378 +-1.6903851 age groups -0.048957378 +-2.1510062 social etiquette -0.048957378 +-1.5381981 
business etiquette -0.048957378 +-0.995135 formal spoken -0.048957378 +-0.6942867 spoken language -0.048957378 +-2.1621325 I wish -0.13618872 +-2.274766 they wish -0.27955192 +-2.2860384 who wish -0.048957378 +-2.7516315 a teacher -0.048957378 +-3.3047535 a nursery -0.048957378 +-2.7186737 or cram -0.048957378 +-3.1293185 in tourism -0.048957378 +-2.532656 at hotels -0.048957378 +-1.5382957 local tour -0.048957378 +-0.6942867 tour guides -0.048957378 +-3.3624787 the weekends -0.048957378 +-2.3673859 The weekends -0.048957378 +-2.5259967 at weekends -0.048957378 +-2.1669266 very challenging -0.048957378 +-1.5961065 companies receiving -0.048957378 +-0.6942267 receiving thousands -0.048957378 +-0.6942267 kills thousands -0.048957378 +-3.4204583 the key -0.048957378 +-3.5096595 , leading -0.048957378 +-3.4650414 to join -0.048957378 +-2.7130423 of attending -0.048957378 +-2.3287637 when attending -0.048957378 +-1.731922 still attending -0.048957378 +-3.106568 in obtaining -0.048957378 +-2.1483922 world obtaining -0.048957378 +-1.8656976 without obtaining -0.048957378 +-3.2979217 and time-management -0.048957378 +-3.3047535 a semi-regular -0.048957378 +-2.7983925 work regimen -0.048957378 +-3.4204583 the oppressive -0.048957378 +-3.5096595 , thought-controlling -0.048957378 +-0.6942867 thought-controlling atmosphere -0.048957378 +-2.4782233 people accustomed -0.048957378 +-1.1710447 welcome shock -0.048957378 +-3.4204583 the cloistered -0.048957378 +-3.3047535 a fascinating -0.048957378 +-2.1511507 world exists -0.048957378 +-0.99501497 challenges exists -0.048957378 +-3.4204583 the recommended -0.048957378 +-0.6942867 recommended norms -0.048957378 +-3.28005 of contemporary -0.048957378 +-1.9447823 university thought -0.048957378 +-0.9947796 concentrated thought -0.048957378 +-1.1705118 critical thought -0.048957378 +-1.1710447 thought patterns -0.048957378 +-2.7713203 not apply -0.048957378 +-2.9060166 are mocked -0.048957378 +-2.7396104 as absurd -0.048957378 +-3.5096595 , occupying -0.048957378 +-0.6942867 occupying oneself -0.048957378 +-1.6929126 outside activity -0.048957378 +-1.2955703 extra-curricular activity -0.048957378 +-0.9947796 physical activity -0.048957378 +-2.7983925 work restricts -0.048957378 +-1.0614876 bad habits -0.048957378 +-2.1414988 social habits -0.048957378 +-1.293971 sleeping habits -0.048957378 +-1.3902439 healthy habits -0.048957378 +-0.9942178 recreational habits -0.048957378 +-1.4715303 habits deleterious -0.048957378 +-3.644728 Said -0.048957378 +-1.1710447 include extended -0.048957378 +-0.6942867 extended periods -0.048957378 +-0.995135 video game -0.048957378 +-0.6942867 game playing -0.048957378 +-3.5096595 , lounging -0.048957378 +-2.401619 fs sofa -0.048957378 +-0.6942867 sofa watching -0.048957378 +-0.6942867 watching daytime -0.048957378 +-0.6942867 daytime television -0.048957378 +-3.5096595 , indulging -0.048957378 +-3.1293185 in repeated -0.048957378 +-0.6942867 repeated bouts -0.048957378 +-3.28005 of binge -0.048957378 +-2.7309246 with equally -0.048957378 +-0.6942867 equally dissolute -0.048957378 +-2.469975 an obsessive -0.048957378 +-0.6942867 obsessive devotion -0.048957378 +-3.4163227 to online -0.048957378 +-2.0856535 learning online -0.048957378 +-0.69410884 Playing online -0.048957378 +-1.1710447 online fleshpots -0.048957378 +-0.6942867 fleshpots promising -0.048957378 +-3.2799814 and sexual -0.048957378 +-0.6942267 promising sexual -0.048957378 +-0.995135 sexual release -0.048957378 +-3.1293185 in exchange -0.048957378 +-3.1303647 is 
split -0.048957378 +-1.5961065 therefore behooves -0.048957378 +-3.4650414 to arrange -0.048957378 +-2.7713203 not bear -0.048957378 +-0.6942867 bear mentioning -0.048957378 +-3.09512 One -0.187397 +-2.1054032 any REAL -0.048957378 +-3.644728 Nor -0.048957378 +-2.8665538 the efforts -0.048957378 +-2.9977748 their efforts -0.048957378 +-0.69410884 anti-smoking efforts -0.048957378 +-2.0880563 good life-lesson -0.048957378 +-1.1710447 hopefully tie-in -0.048957378 +-2.7396104 as failure -0.048957378 +-1.9465656 own disciplines -0.048957378 +-3.4650414 to achieve -0.048957378 +-3.4204583 the greatest -0.048957378 +-0.6942867 greatest assets -0.048957378 +-1.5383809 obtain control -0.048957378 +-3.2799814 and desires -0.048957378 +-2.3987331 fs desires -0.048957378 +-2.4782233 people depend -0.048957378 +-1.8383849 right track -0.048957378 +-2.2890806 so spoiled -0.048957378 +-2.2313263 these spoiled -0.048957378 +-1.4704666 everything paid -0.048957378 +-1.4704666 already paid -0.048957378 +-1.2950919 approach paid -0.048957378 +-3.4204583 the much-needed -0.048957378 +-2.401619 fs self -0.048957378 +-2.45335 this transitional -0.048957378 +-3.2908964 a transitional -0.048957378 +-2.5982223 from dependence -0.048957378 +-2.7282374 I shall -0.048957378 +-2.2854536 we shall -0.048957378 +-0.69410884 reckoning shall -0.048957378 +-2.4028242 my claim -0.048957378 +-3.4204583 the domestic -0.048957378 +-0.6942867 domestic front -0.048957378 +-1.5382957 generally leaves -0.048957378 +-3.4204583 the security -0.048957378 +-1.8681672 family unit -0.048957378 +-3.246793 and begins -0.048957378 +-1.1868148 he begins -0.187397 +-1.2950919 schedule begins -0.048957378 +-3.4650414 to strike -0.048957378 +-3.3047535 a course-load -0.048957378 +-1.3922309 encourage thoughts -0.048957378 +-1.5378767 explore thoughts -0.048957378 +-3.28005 of pecuniary -0.048957378 +-3.2799814 and changing -0.048957378 +-2.0511177 his changing -0.048957378 +-2.8497334 they hobbies -0.048957378 +-1.8375617 your hobbies -0.048957378 +-3.4650414 to assert -0.048957378 +-3.28005 of starting -0.048957378 +-3.09512 Whether -0.048957378 +-2.7267487 will vary -0.048957378 +-3.5096595 , courting -0.048957378 +-3.4931862 , meeting -0.048957378 +-1.5378767 generally meeting -0.048957378 +-0.995135 meeting viable -0.048957378 +-2.7309246 with whom -0.048957378 +-3.644728 Independent -0.048957378 +-3.2676826 of supporting -0.048957378 +-1.837449 income supporting -0.048957378 +-2.222701 reasons stated -0.048957378 +-1.5382957 generally fresh -0.048957378 +-3.3047535 a firm -0.048957378 +-3.0452757 for superiors -0.048957378 +-2.0880563 good life-skill -0.048957378 +-0.6942867 life-skill enhancements -0.048957378 +-3.644728 Searching -0.048957378 +-2.7396104 as filling -0.048957378 +-2.7796392 job interview -0.048957378 +-3.4204583 the joys -0.048957378 +-3.4204583 the pain -0.048957378 +-1.868011 being rejected -0.048957378 +-3.1293185 in cooperation -0.048957378 +-3.2979217 and humble -0.048957378 +-2.2339225 help relieve -0.048957378 +-3.28005 of funding -0.048957378 +-3.120598 is involved -0.048957378 +-0.99501497 everyone involved -0.048957378 +-2.7413492 I emphatically -0.048957378 +-2.4128842 life style -0.048957378 +-2.8983755 are individuals -0.048957378 +-1.6920565 These individuals -0.048957378 +-2.1061108 into adulthood -0.048957378 +-2.469975 an indispensable -0.048957378 +-1.2958019 highly recommend -0.048957378 +-2.469975 an opportune -0.048957378 +-2.1071095 young men -0.048957378 +-2.7131412 or women -0.048957378 
[ARPA-format language-model data hunk, condensed. Extraction collapsed the
diff's added lines into a handful of very long runs; in the original patch
each entry is its own `+`-prefixed line. The hunk continues with several
thousand more 2-gram entries of the form "log10(prob) token_1 token_2
log10(backoff)", for example:

+-2.415659 more women -0.048957378
+-0.40954217 Las Vegas -0.048957378

followed by a "\3-grams:" section whose entries carry three tokens plus an
optional log10 backoff weight, for example:

+-0.0011299675 disagree .
+-0.052752033 it ?
+-1.3160119 main reasons I

Sentence-boundary markers such as <s> and </s> appear to have been stripped
from the extracted text as if they were HTML tags, which is why many entries
show fewer visible tokens than their n-gram order.]
+-0.8576102 attend classes . +-1.1169974 n't know . +-0.20840219 don ft. . +-0.20840219 doesn ft. . +-0.20840219 hadn ft. . +-1.3532796 at all . +-1.1163046 for them . +-0.62139887 to them . +-0.74332404 kill them . +-0.74332404 serve them . +-0.74332404 impress them . +-0.50325936 their futures . +-0.5662341 the society . +-0.5956286 in society . +-0.4393193 of society . +-0.5662341 to society . +-0.4883827 broader society . +-0.5662341 modern society . +-0.65054137 highly specialized . +-0.50325936 of contexts . +-0.76881135 of specialty . +-0.7710068 by itself . +-0.50325936 with specialization . +-0.65054137 chosen fields . +-0.32336983 of temptation . +-0.32336983 sample temptation . +-1.0462253 social activities . +-0.7932434 community activities . +-0.50325936 students socialize . +-0.50325936 unpredictable results . +-0.65054137 and teachers . +-0.7484637 society demands . +-0.39026454 for life . +-0.53280646 in life . +-0.39026454 of life . +-0.66592014 their life . +-0.66592014 school life . +-0.5452266 college life . +-0.570058 real life . +-0.7773271 fs life . +-0.32918224 daily life . +-0.570058 private life . +-0.48235404 the workforce . +-0.52737445 additional responsibilities . +-0.73530406 social responsibilities . +-0.78031975 leave university . +-0.40502557 through university . +-0.8713421 passed up . +-0.7484637 be sleeping . +-0.50325936 immediate repercussions . +-1.063004 often difficult . +-0.7484637 adult lifestyle . +-0.38657898 very helpful . +-0.91357267 this regard . +-0.68226635 family finances . +-0.68226635 his finances . +-0.65054137 finances responsibly . +-1.1189742 credit cards . +-0.79230475 tuition loans . +-0.61316794 , etc. . +-0.44241893 uniforms etc. . +-0.50325936 financial peril . +-0.7417742 an income . +-0.7417742 extra income . +-0.50325936 such debts . +-0.5602273 living expenses . +-0.6390361 with family . +-0.26203418 their family . +-0.6390361 own family . +-1.071505 an adult . +-0.5803078 and development . +-0.5803078 adult development . +-1.073351 become independent . +-0.43274358 they graduate . +-0.78140795 we graduate . +-0.81818664 educated professional . +-0.62786806 for customers . +-0.78382015 of customers . +-0.62786806 handling customers . +-0.8197677 and responsible . +-0.3000333 academic achievement . +-0.65054137 of necessity . +-0.815245 economical sense . +-0.88206494 work force . +-0.50325936 of staff . +-0.5214464 personal situation . +-0.5214464 own situation . +-0.4167356 the year . +-0.6039533 of education . +-0.3926142 their education . +-0.545997 an education . +-0.40555257 college education . +-0.4714027 's education . +-0.4714027 advanced education . +-0.4714027 proper education . +-0.4714027 our education . +-0.4714027 further education . +-0.4714027 incomplete education . +-0.74969053 the world . +-0.7251747 working world . +-0.76822126 academic world . +-0.85924894 real world . +-0.7251747 adult world . +-0.69812334 considerable success . +-0.83242613 academic success . +-0.95817447 smoking entirely . +-0.63991165 its importance . +-0.63991165 utmost importance . +-1.0488918 his own . +-0.50325936 own pockets . +-0.8027881 this transition . +-0.56869084 the campus . +-0.43500805 on campus . +-0.50325936 are cultivated . +-0.8451183 live without . +-0.8541513 all about . +-0.50325936 salable product . +-0.75616264 is limited . +-0.7484637 class assignments . +-0.7982902 help either . +-0.3697228 , too . +-0.674335 bad too . +-0.674335 life too . +-0.65054137 most complicated . +-0.46671915 or business . 
+-0.46671915 about business . +-0.46671915 ' business . +-0.76881135 work ethic . +-1.5568883 to learn . +-0.6433213 important lessons . +-0.6433213 beneficial lessons . +-0.50325936 be exhausting . +-0.50325936 priorities straight . +-1.1014975 to quit . +-0.28301305 be beneficial . +-0.46671915 job beneficial . +-0.46671915 personally beneficial . +-1.1137376 as possible . +-0.776974 later years . +-0.776974 senior years . +-0.7133119 of medicine . +-0.82945204 give three . +-0.7133119 life lesson . +-0.84143203 used before . +-0.65054137 from organizations . +-0.50325936 college gates . +-1.1286163 to meet . +-1.0399125 something new . +-0.85632753 there against . +-0.7484637 work load . +-0.51694524 their performance . +-0.51694524 academic performance . +-0.99687827 their books . +-0.50325936 a hindrance . +-0.8287384 never worked . +-0.6954135 really worked . +-0.65054137 lower workload . +-0.42071033 financial needs . +-0.58183706 the costs . +-0.6462078 tuition costs . +-0.98288625 to afford . +-0.44241893 educational process . +-0.44241893 learning process . +-0.7710068 the system . +-0.53399396 into practice . +-0.2624939 as adults . +-0.424702 responsible adults . +-0.424702 mature adults . +-0.50325936 college counterparts . +-0.97546875 a problem . +-0.81818664 day since . +-0.7133119 be concerned . +-0.65054137 less sophisticated . +-0.79230475 an effect . +-0.4654606 the workplace . +-0.73722285 f workplace . +-0.815245 studies instead . +-0.85325277 course material . +-0.7484637 principle concern . +-0.65054137 full-time worker . +-0.424702 money problems . +-0.424702 serious problems . +-0.424702 respiratory problems . +-1.0845375 their families . +-0.50325936 courses began . +-0.95817447 so far . +-0.50325936 fully immersed . +-0.7133119 to rest . +-1.0111022 their lives . +-0.38277295 working lives . +-1.1328908 they want . +-0.7520951 we want . +-0.7279554 many things . +-0.7279554 other things . +-0.6190821 those things . +-0.3489827 new things . +-0.50325936 of drinks . +-0.7484637 study harder . +-0.65054137 to suffer . +-0.44241893 be fun . +-0.44241893 having fun . +-0.95817447 simple tasks . +-0.50325936 n't fatal . +-0.5645745 in Japan . +-0.60063195 in America . +-0.60063195 North America . +-0.53198135 in freedom . +-0.61893153 of freedom . +-0.53198135 more freedom . +-1.0723064 to think . +-0.8032087 do anything . +-1.1815065 reasons why . +-0.91210526 to attend . +-0.30621156 time wisely . +-0.51694524 spend wisely . +-0.50325936 precious commodity . +-1.1585519 20 hours . +-0.65054137 5 . +-0.6101753 student days . +-0.7956457 college days . +-0.911714 these days . +-0.93419963 a week . +-0.8841455 per week . +-0.7484637 quit easily . +-0.5252333 that day . +-0.6107019 the day . +-0.5252333 one day . +-0.5252333 each day . +-0.65054137 and coworkers . +-0.50325936 rapidly underway . +-0.8578608 get home . +-0.5349836 of responsibility . +-0.7504482 financial responsibility . +-1.0482935 doing homework . +-0.65054137 properly completed . +-0.7133119 at lunch . +-0.7133119 fve mentioned . +-0.7115423 to person . +-0.7115423 better person . +-0.50325936 early twenties . +-0.50325936 of remuneration . +-1.0919538 financial independence . +-0.7133119 and easier . +-0.85325277 to ask . +-0.50325936 Technology Sydney . +-0.80718005 change companies . +-0.65054137 perfect test . +-0.76881135 their character . +-0.65054137 positive thing . +-0.11020566 the economy . +-0.20840219 local economy . +-0.20840219 general economy . +-0.50325936 stay attached . 
+-0.7133119 very common . +-0.5857135 in class . +-0.50325936 and faculty . +-1.0184579 the individual . +-0.7206579 responsible individual . +-0.65054137 to coursework . +-0.8233063 and discipline . +-0.50325936 to details . +-0.7484637 their co-workers . +-0.5563739 and age . +-0.5563739 earlier age . +-0.5563739 correct age . +-0.65054137 made clear . +-0.65054137 of finance . +-0.51694524 and resources . +-0.51694524 economic resources . +-0.50325936 or copywriting . +-0.65054137 an office . +-0.50325936 a newcomer . +-0.50325936 and mentoring . +-0.50325936 are career-oriented . +-0.50325936 a branch . +-0.50325936 human beings . +-0.33312583 this opinion . +-0.44134372 the opinion . +-0.83085275 my opinion . +-0.7982902 around us . +-0.65054137 live under . +-1.0603516 be allowed . +-0.65054137 any conditions . +-0.50325936 fs fate . +-0.65054137 for generations . +-0.50325936 be irreversible . +-0.50325936 entire nation . +-0.50325936 the production . +-0.50325936 their agendas . +-0.50325936 be accepted . +-0.8453387 job right . +-0.50325936 perfect self-improvement . +-0.7484637 curious stage . +-0.50325936 financially self-supporting . +-0.65054137 a whole . +-0.86640126 the youth . +-0.65054137 their schooling . +-0.65054137 relaxation purposes . +-0.7133119 be desired . +-0.76881135 the classroom . +-0.81993985 a balance . +-0.65054137 three points . +-0.65054137 and socially . +-0.6060957 their careers . +-0.5214464 professional careers . +-0.65054137 multicultural cities . +-0.50325936 college ends . +-0.65054137 one stronger . +-0.50325936 her actions . +-0.77748513 time together . +-0.7227986 in lectures . +-0.50325936 their degrees . +-0.7133119 very expensive . +-0.7133119 are busy . +-0.50325936 busy professionals . +-0.50325936 other sources . +-0.65054137 degree faster . +-0.82629454 their children . +-0.63991165 his children . +-1.2318772 I had . +-0.50325936 local pharmacy . +-0.65054137 1 . +-0.7133119 2 . +-0.7133119 3 . +-0.91210526 a resume . +-0.65054137 4 . +-0.76881135 to survive . +-0.65054137 of vacation . +-0.65054137 career path . +-0.50325936 their homework\/assignments . +-0.7484637 are using . +-0.65054137 own initiative . +-0.5160897 the customer . +-0.81157017 unreasonable spending . +-0.7133119 be welcome . +-0.7133119 of research . +-0.65054137 their professor . +-0.65054137 some merit . +-0.94454867 educational materials . +-0.9889069 at large . +-0.65054137 environment changes . +-0.81818664 to me . +-0.7710068 original goal . +-0.44241893 very rewarding . +-0.44241893 quite rewarding . +-0.65054137 there everyday . +-0.50325936 the bill . +-0.50325936 summer vacations . +-0.7710068 the employees . +-0.50325936 to multi-task . +-0.50325936 an assignment . +-0.7133119 financial difficulties . +-0.50325936 of yourself . +-0.44241893 simply entertainment . +-0.44241893 including entertainment . +-0.50325936 hotel porter . +-0.44241893 these opinions . +-0.27126983 three opinions . +-0.32336983 a textbook . +-0.32336983 the textbook . +-0.50325936 a lifetime . +-0.50325936 about glife . +-0.7133119 and clothes . +-0.7484637 study groups . +-0.65054137 social etiquette . +-0.50325936 spoken language . +-0.7484637 at weekends . +-0.50325936 work regimen . +-0.50325936 as absurd . +-0.95817447 bad habits . +-0.7133119 learning online . +-0.50325936 bear mentioning . +-0.50325936 as failure . +-0.32336983 and desires . +-0.32336983 fs desires . +-0.50325936 right track . +-0.76881135 my claim . +-0.50325936 life-skill enhancements . 
+-0.32336983 is involved . +-0.32336983 everyone involved . +-0.50325936 mental state . +-0.65054137 the population . +-0.65054137 loan institutes . +-0.3000333 or guardians . +-0.50325936 household earners . +-0.65054137 opinion anyway . +-0.65054137 almost negligible . +-0.50325936 be fine . +-0.50325936 ' discretion . +-0.50325936 or casework . +-0.6639529 's health . +-0.6639529 own health . +-0.6639529 physical health . +-0.50325936 light bulb . +-0.50325936 be cherished . +-0.75616264 new ideas . +-0.50325936 their peers . +-0.65054137 their exams . +-0.65054137 grow larger . +-0.50325936 a threat . +-0.7484637 jobs properly . +-0.65054137 from exhaustion . +-0.7133119 profits heavily . +-0.65054137 and smaller . +-0.65054137 and nightclubs . +-0.76881135 Las Vegas . +-0.76881135 a diploma . +-0.50325936 addictive drug . +-0.50325936 cash prizes . +-0.50325936 parents proud . +-0.65054137 term goals . +-0.50325936 working holiday . +-0.7133119 or partying . +-0.50325936 partying funds . +-0.7130377 in restaurants . +-0.3112425 all restaurants . +-0.68192095 includes restaurants . +-0.50325936 and short-sighted . +-0.50325936 teaching assistants . +-0.50325936 and mastered . +-0.50325936 new barricades . +-0.50325936 our cobblestones . +-0.50325936 be discouraged . +-0.50325936 United States . +-0.50325936 time consuming . +-0.50325936 healthy existence . +-0.7710068 will eat . +-0.50325936 finishing sake . +-0.50325936 and analysis . +-0.65054137 times past . +-0.65054137 mundane worries . +-0.50325936 will falter . +-0.50325936 their schoolwork . +-0.50325936 becoming overwhelmed . +-0.50325936 a doctor . +-0.7484637 examined here . +-0.50325936 in volume . +-0.50325936 shift afterwards . +-0.50325936 even pessimism . +-0.50325936 their perspective . +-0.61913055 of smoking . +-0.61913055 themselves smoking . +-0.61913055 always smoking . +-0.61913055 regular smoking . +-0.61913055 discourage smoking . +-0.50325936 middle route . +-0.6359399 of smoke . +-0.7499746 to smoke . +-0.6359399 still smoke . +-0.6359399 secondhand smoke . +-0.76881135 and well-being . +-0.85325277 restaurant patrons . +-0.50325936 non-smoking establishment . +-0.50325936 them sick . +-0.65054137 law everywhere . +-0.65054137 perfectly justified . +-0.50325936 healthy citizens . +-0.51694524 on non-smokers . +-0.51694524 are non-smokers . +-0.32336983 banned altogether . +-0.32336983 restaurants altogether . +-0.50325936 a stimulant . +-0.50325936 not fair . +-0.50325936 own lifestyles . +-0.50325936 into account . +-0.50325936 be ignored . +-0.50325936 tobacco-related illnesses . 
+-1.5517123 three reasons it +-1.1186665 While it +-0.95497584 nor can it +-0.8180224 disagree that it +-1.0054915 statement that it +-1.3052976 is that it +-0.6709817 , that it +-0.42752212 agree that it +-0.81400126 believe that it +-0.8180224 imply that it +-0.8180224 power that it +-1.2308744 think that it +-0.8180224 homework that it +-1.0916454 fact that it +-1.0916454 opinion that it +-0.94197637 f is it +-0.94197637 but is it +-0.94197637 or is it +-2.0104768 college student it +-1.4915415 job , it +-1.2763928 and , it +-1.2435931 course , it +-1.4696465 students , it +-1.3637582 Therefore , it +-1.4043024 college , it +-1.4307375 However , it +-1.2435931 activities , it +-1.3637582 life , it +-0.8632002 result , it +-1.113926 Secondly , it +-1.3170793 however , it +-1.1807604 year , it +-0.8632002 straightforward , it +-1.2435931 second , it +-0.8632002 enter , it +-1.0771625 Lastly , it +-0.93569726 First , it +-1.0771625 opinion , it +-0.8632002 wealthy , it +-0.8632002 Hence , it +-0.42878085 Zealand , it +-0.8632002 liked , it +-0.8632002 extremes , it +-0.77654874 smoke , it +-0.9246425 and it +-0.9246425 valuable and it +-1.7451524 money and it +-1.1828005 responsibility and it +-1.1828005 wasted and it +-0.9246425 appetite and it +-0.9611059 any of it +-0.9595606 see to it +-0.9595606 violently to it +-0.9517632 occur from it +-1.3921618 by working it +-1.115167 distractions as it +-0.88602304 pressures as it +-1.115167 much as it +-0.88602304 expensive as it +-1.2043576 work there it +-2.5642097 college students it +-0.99991333 to accomplish it +-1.0570117 to leave it +-0.8994242 sought if it +-1.3362575 even if it +-1.7105911 , then it +-0.79590094 job but it +-1.1138029 , but it +-0.79590094 -- but it +-1.9592355 I agree it +-1.9183798 students have it +-1.7063701 extra money it +-1.1985786 And it +-0.8668363 , making it +-1.4630674 for some it +-0.8651415 what does it +-0.9329084 major or it +-1.4347374 class or it +-0.49303538 and makes it +-0.42638204 environment makes it +-0.42638204 This makes it +-0.42638204 indirectly makes it +-0.9215721 by themselves it +-0.9333172 enter into it +-0.91616297 isn ft it +-1.1837178 and spend it +-1.3979636 to spend it +-1.5707034 However it +-0.8435367 is doing it +-0.8435367 while doing it +-0.9460511 whatsoever when it +-1.1536856 often do it +-0.9083339 and do it +-1.2076522 to earn it +-0.9333344 So while it +-0.92086303 should take it +-0.8963454 we know it +-1.1725395 and what it +-0.91135865 of what it +-0.8321829 learn what it +-0.60934806 job because it +-0.7154875 is because it +-0.60934806 jobs because it +-0.60934806 responsibility because it +-0.60934806 ages because it +-1.2274874 of them it +-1.3135337 If it +-0.83382165 for whatever it +-0.5354339 Because it +-1.7679867 to get it +-0.6127273 and how it +-1.0427647 I believe it +-1.3875618 to make it +-1.0579593 should make it +-1.5707034 Firstly it +-1.1704433 of living it +-1.3914112 , however it +-0.92679805 point where it +-0.6780335 often find it +-0.80527925 students find it +-0.6780335 or find it +-0.9452322 I feel it +-0.6777468 people feel it +-0.98160887 So it +-0.69641256 than hinder it +-0.91313225 and getting it +-0.99038965 to try it +-0.8668363 hopefully appreciate it +-0.5148094 I think it +-1.010676 not think it +-0.69641256 Everyone knows it +-0.7667724 , yet it +-0.5354339 time ... 
it +-0.8981341 personal opinion it +-0.5354339 to rush it +-1.1186665 Although it +-0.5354339 would deem it +-0.5354339 to repay it +-0.8386012 completely banning it +-0.5354339 because nowadays it +-2.5190015 I can +-0.8857135 The statement can +-0.89159304 reasons it can +-0.89159304 While it can +-1.791068 , it can +-1.4032463 and it can +-1.5351136 skills that can +-0.93774277 someone that can +-0.93774277 gains that can +-1.8358839 a job can +-1.2886233 part-time job can +-1.4210838 time job can +-0.8911071 studying full-time can +-2.050851 a student can +-2.0094023 the student can +-1.7493175 studies , can +-1.2527076 one , can +-2.1385717 , and can +-0.9478223 now and can +-0.9478223 manner and can +-1.3447253 of study can +-1.1133463 of studying can +-0.8657286 while studying can +-1.2298374 This experience can +-1.3494016 , students can +-1.1748081 and students can +-2.1378505 college students can +-0.9202135 believe students can +-2.562944 part time can +-0.94386005 This time can +-1.7207611 part-time jobs can +-1.3471756 time jobs can +-0.880733 Student jobs can +-0.83394766 that one can +-1.122046 , one can +-0.83394766 where one can +-1.4856735 at school can +-1.2161113 course work can +-1.6249113 hard work can +-0.95404977 what money can +-1.0187029 that they can +-0.8910716 , they can +-1.329574 as they can +-1.5551031 if they can +-0.5060517 so they can +-1.049392 people they can +-1.2729213 where they can +-1.581983 that you can +-0.8782071 These experiences can +-0.6974243 academic qualification can +-0.80805176 service organization can +-1.1600474 the environment can +-1.9643993 There can +-0.9342319 whose parents can +-0.94889486 that people can +-1.5617176 They can +-1.3956043 This can +-1.1213467 People can +-0.99214834 this area can +-0.80805176 customer nor can +-1.3857073 It can +-1.0593362 and colleges can +-0.9247814 how can +-0.9493795 enjoying life can +-0.8350799 the result can +-0.74106234 experience which can +-0.8920372 skills which can +-0.8920372 something which can +-0.74106234 responsibilities which can +-1.069079 a family can +-0.8058585 every family can +-1.0320432 this period can +-0.885046 qualified professional can +-0.87611413 If someone can +-0.9306293 that matter can +-0.8350799 and back can +-1.0019507 Jobs can +-1.4736953 a person can +-0.536132 added self-esteem can +-0.8350799 and drinking can +-0.6974243 Such connections can +-0.6974243 's coursework can +-0.9033973 skills he can +-0.8934511 young age can +-0.8962261 money we can +-0.8962261 what we can +-0.536132 few bucks can +-0.6974243 The damage can +-0.6974243 a professor can +-1.0985031 at large can +-0.536132 shop owners can +-0.536132 Idle minds can +-0.6974243 These individuals can +-0.6974243 This relief can +-0.6974243 and bars can +-1.6872208 to smoke can +-0.536132 Nothing can +-0.536132 Smokers can +-0.9613249 can it be +-0.5904652 statement can be +-0.75249493 it can be +-0.44796956 that can be +-0.8953602 job can be +-0.5904652 full-time can be +-0.3375865 , can be +-0.3375865 study can be +-0.3375865 time can be +-0.5904652 school can be +-0.69151884 work can be +-0.5904652 qualification can be +-0.5904652 There can be +-0.731204 This can be +-0.5904652 area can be +-0.44796956 It can be +-0.75249493 which can be +-0.5904652 bucks can be +-0.5904652 damage can be +-0.5904652 bars can be +-2.3161147 a student be +-0.97020364 that will be +-0.97396755 job will be +-0.93338394 student will be +-0.91396713 , will be +-0.8280984 there will be +-0.9910503 students will be 
+-0.93338394 jobs will be +-0.91396713 college will be +-0.6379098 they will be +-0.8280984 employers will be +-0.6949426 There will be +-0.6949426 families will be +-0.8280984 we will be +-1.545812 course , be +-0.9649224 interests , be +-1.2003063 can often be +-0.55801034 can not be +-1.1179022 will not be +-0.67315096 may not be +-0.806374 otherwise not be +-0.7118058 should not be +-0.806374 might not be +-0.98768264 would not be +-0.806374 probably not be +-1.7056035 education and be +-1.2798102 reasons to be +-1.4236912 is to be +-1.5023708 student to be +-0.78237873 and to be +-1.5282325 time to be +-1.2119194 necessary to be +-0.7872396 enough to be +-1.4546759 need to be +-1.4069595 learn to be +-1.1015195 lessons to be +-0.87791914 years to be +-0.87791914 lesson to be +-1.2798102 means to be +-0.42729965 needs to be +-0.5938167 likely to be +-1.1015195 try to be +-1.1015195 responsibility to be +-0.87791914 wisdom to be +-0.87791914 employees to be +-0.87791914 shown to be +-0.81793046 can also be +-0.52540714 will also be +-0.7314308 may also be +-0.38945368 should also be +-0.7314308 must also be +-0.87846375 would also be +-1.4205849 first time be +-0.41237834 it may be +-0.46266088 experience may be +-0.8505125 students may be +-0.6157012 but may be +-0.6157012 college may be +-0.78964293 they may be +-0.6157012 do may be +-0.6157012 income may be +-0.6157012 workplace may be +-0.93739694 would otherwise be +-0.34155363 it should be +-0.48884022 that should be +-0.4182371 student should be +-0.4227813 study should be +-0.4227813 studying should be +-0.4227813 restaurant should be +-0.6256627 students should be +-0.48884022 work should be +-0.4227813 such should be +-0.4201718 they should be +-0.4227813 people should be +-0.52606475 They should be +-0.26153165 College should be +-0.26153165 It should be +-0.26153165 life should be +-0.4227813 courses should be +-0.52606475 Students should be +-0.4227813 Jobs should be +-0.5339085 we should be +-0.4227813 deviation should be +-0.4227813 activity should be +-0.13931742 smoking should be +-0.4227813 Smoking should be +-0.4227813 tobacco should be +-1.0153741 can only be +-1.1557078 will only be +-1.0153741 should only be +-1.5583255 To be +-0.8722903 not even be +-1.0921409 may even be +-0.9345244 must first be +-1.0170925 can ft be +-0.82552093 won ft be +-0.5660162 this could be +-0.6977501 that could be +-0.6608859 study could be +-0.5660162 studies could be +-0.5660162 jobs could be +-0.5660162 solution could be +-0.5660162 Which could be +-0.9539855 not all be +-0.6007683 must be +-0.7455161 student must be +-0.6007683 There must be +-0.6007683 Workers must be +-0.7045622 Students must be +-1.1442492 wo n't be +-0.8583598 what might be +-0.6041735 it would be +-0.57421464 , would be +-0.38924602 study would be +-0.4950434 there would be +-0.6041735 students would be +-0.4950434 then would be +-0.57421464 It would be +-0.4950434 finances would be +-0.4950434 store would be +-0.29627293 Students would be +-0.4950434 choice would be +-0.57421464 restaurants would be +-0.88403475 can however be +-0.8911713 will never be +-1.1233201 not just be +-0.5388479 could preciously be +-0.4402455 will always be +-0.8715348 should actually be +-0.8715348 allow us be +-0.83942044 will certainly be +-0.5388479 ` t be +-0.70136726 misogyny \ be +-2.066828 can be argued +-1.8067875 also be argued +-1.5056869 I disagree that +-0.7021219 the statement that +-1.4723728 job for that +-0.94441175 full-time for that +-0.93551975 my reasons that 
+-1.1957684 to it that +-1.4310732 makes it that +-0.31214613 be argued that +-0.95242125 points that that +-1.1618578 a job that +-1.8034397 part-time job that +-1.3095354 full-time job that +-1.4749159 time job that +-1.1752357 first job that +-1.3108716 this is that +-1.7471848 it is that +-1.0021493 , is that +-0.89828086 reason is that +-0.5865438 jobs is that +-1.0735788 school is that +-1.0735788 way is that +-0.5865438 point is that +-0.861007 here is that +-1.2383459 smoking is that +-2.1362226 a student that +-0.92819893 on , that +-1.1892664 then , that +-1.7036616 money , that +-0.92819893 say , that +-1.5355258 however , that +-1.1892664 opinion , that +-1.3287644 Furthermore , that +-0.92819893 demanding , that +-1.8356706 is not that +-1.2305553 the case that +-0.917871 and employment that +-1.9181563 job in that +-1.6796713 , and that +-1.4232886 experience and that +-0.9292705 compounded and that +-0.9292705 places and that +-1.3960968 top of that +-0.8246526 a nature that +-1.22666 unrelated to that +-0.94829816 adding to that +-0.94829816 admitted to that +-1.4852856 of course that +-0.93478733 hard study that +-0.81595004 acquire experience that +-1.0023046 and experience that +-1.300666 work experience that +-0.81595004 worthwhile experience that +-1.4007254 this reason that +-1.5652962 that students that +-1.447254 many students that +-0.8704389 some students that +-1.2575127 gives students that +-0.89845586 A students that +-1.4849237 their studies that +-1.5279263 study time that +-1.4436386 free time that +-1.0785748 part-time jobs that +-0.87164164 in jobs that +-0.87164164 The jobs that +-1.1993265 is one that +-1.201165 particular school that +-1.7695034 part-time work that +-1.2004569 Part-time work that +-0.8999258 benefits being that +-0.9374747 decide if that +-0.86349785 epeople-skills f that +-0.93368185 is such that +-0.92375785 dishes but that +-0.5980288 I agree that +-0.86938024 strongly agree that +-1.8414453 the money that +-0.4420721 to say that +-0.4420721 people say that +-0.4420721 would say that +-1.568508 And that +-0.93128175 education so that +-0.5303195 the advertisements that +-1.0638313 part-time positions that +-0.797243 out ways that +-0.8676183 The experiences that +-0.864944 Some benefits that +-0.79972905 valuable skill that +-0.9127906 far better that +-0.82254535 or industry that +-0.7490672 with skills that +-0.7490672 The skills that +-0.7490672 life skills that +-0.7490672 countless skills that +-0.7490672 learn skills that +-1.9959812 is important that +-1.5975901 very important that +-1.1361053 an environment that +-0.5303195 students complain that +-0.9144017 justified enough that +-1.3134401 the food that +-1.3870778 the idea that +-1.0692879 your parents that +-0.85837185 our parents that +-0.85662746 interpersonal relationships that +-1.5194669 of people that +-0.8533913 they see that +-1.0124757 to show that +-0.44295323 is something that +-0.58198553 studying something that +-0.71952224 do something that +-0.7521441 to know that +-0.97518206 n't know that +-0.75809586 The opportunities that +-1.5203687 of all that +-1.7112169 to get that +-0.84210014 the club that +-0.9157236 genuine activities that +-0.8533913 unmotivated workers that +-1.1034592 the demands that +-0.35404378 I believe that +-0.26092857 also believe that +-0.42157966 ft believe that +-0.13904124 strongly believe that +-0.42157966 really believe that +-0.42157966 truly believe that +-0.42157966 firmly believe that +-0.7197185 fs life that +-1.1476376 
about life that +-1.2793947 additional responsibilities that +-0.75809586 more aware that +-0.84027153 only factor that +-1.4062219 work force that +-1.181169 work world that +-0.5303195 the consequence that +-0.9221272 pointed out that +-0.9163531 and energy that +-0.86349785 over someone that +-0.8550063 and feel that +-0.75809586 in subjects that +-1.2604542 more than that +-0.84074193 story than that +-0.5303195 to imply that +-0.87283367 fs just that +-0.9163531 a lesson that +-0.3523021 I understand that +-0.62763685 not understand that +-0.62763685 also understand that +-0.6890228 of power that +-0.82254535 a system that +-0.84210014 the effect that +-0.5303195 We hope that +-0.5303195 it follows that +-1.0485353 health problems that +-0.5303195 unfathomable waters that +-0.6457592 , things that +-0.7625431 other things that +-0.6457592 difficult things that +-0.6457592 out things that +-1.5812865 Students that +-0.6578134 I think that +-0.5149696 also think that +-0.5149696 you think that +-0.5149696 who think that +-0.30532625 ft think that +-0.67113674 , anything that +-0.67113674 learn anything that +-0.6890228 people argue that +-1.2010198 is why that +-1.1517092 doing homework that +-0.75809586 eat lunch that +-0.38642544 to realize that +-0.38642544 them realize that +-0.38642544 Japan realize that +-0.75809586 may discover that +-0.5303195 by saying that +-0.27384815 the fact that +-1.0347471 the opinion that +-0.7993897 my opinion that +-0.82254535 a policy that +-0.5303195 potential gains that +-0.6890228 many points that +-0.31214613 my belief that +-0.6890228 no doubt that +-1.0993952 With that +-0.5303195 same field\/industry that +-0.9207234 it seems that +-0.6890228 plainly seen that +-0.82254535 have employees that +-0.5303195 employer feels that +-0.5303195 These structures that +-0.5303195 good life-lesson that +-0.5303195 fs self that +-0.5303195 highly recommend that +-0.5303195 and burdens that +-0.5303195 character-building aspect that +-0.7611268 be said that +-0.5303195 also seemed that +-0.5303195 the media that +-0.6890228 so popular that +-0.5303195 Assuming that +-0.6890228 the goals that +-0.5303195 ceaseless cogitation that +-0.5303195 should assume that +-0.5303195 have heard that +-0.96308273 associated with having +-1.9715892 , for having +-1.6053296 reason for having +-1.2441112 argued that having +-2.099288 believe that having +-0.9601871 provides is having +-0.9601871 challenge is having +-1.2090404 that , having +-1.9032536 job , having +-1.4541466 course , having +-1.7609243 Firstly , having +-1.239839 Secondly , having +-1.6728699 First , having +-1.4541466 hand , having +-0.9389273 dimension , having +-1.2554793 of not having +-0.95593876 money by having +-1.3819263 study and having +-1.6737382 studies and having +-0.94856316 huge and having +-1.3819263 class and having +-1.54152 experience of having +-1.54152 importance of having +-1.5279936 years of having +-0.93955064 practice of having +-1.4561914 instead of having +-0.6302723 benefits to having +-2.6743803 college students having +-0.9450255 salary but having +-1.5244596 I believe having +-1.1377846 Not having +-0.81270057 environment without having +-0.81270057 earn without having +-0.8855846 choose between having +-1.1889553 not think having +-0.8747608 of actually having +-0.9463183 up with a +-0.77868885 fill with a +-0.9463183 cope with a +-1.0205005 out with a +-0.77868885 those with a +-0.77868885 schedule with a +-0.5973811 dealing with a +-0.77868885 communicating with a 
+-1.1257546 statement for a +-1.0040727 preparation for a +-0.9006014 studying for a +-1.0040727 working for a +-0.9006014 school for a +-1.0280923 money for a +-0.9006014 foundation for a +-1.6716337 important for a +-1.1257546 them for a +-0.7470942 fit for a +-0.7470942 day for a +-0.7470942 handle for a +-0.7470942 conditions for a +-0.9006014 right for a +-0.9667752 look for a +-0.7470942 expense for a +-0.7470942 options for a +-0.7470942 Searching for a +-0.7470942 ideal for a +-0.94060874 how can a +-0.96860355 can be a +-1.3518808 to be a +-0.69459635 also be a +-1.2592882 may be a +-1.2321413 should be a +-0.9826158 ft be a +-1.1768022 must be a +-0.8030299 however be a +-0.8030299 us be a +-1.6434534 is that a +-1.2846122 say that a +-1.0817838 believe that a +-0.91010344 heard that a +-0.44783682 for having a +-0.24339184 that having a +-0.44783682 is having a +-0.19811971 , having a +-0.47444984 of having a +-0.24339184 to having a +-0.24339184 believe having a +-0.38733426 between having a +-0.38733426 think having a +-0.38733426 actually having a +-2.2963576 part-time job a +-0.97721606 this is a +-0.98941046 it is a +-1.0355538 that is a +-1.0525014 job is a +-0.71169424 working is a +-0.5697588 there is a +-0.9959183 time is a +-0.85102165 school is a +-0.85102165 only is a +-0.9420672 money is a +-0.71169424 option is a +-1.0484575 There is a +-0.71383274 This is a +-0.68053716 College is a +-0.71169424 done is a +-0.6859554 It is a +-0.96281344 life is a +-0.71169424 cards is a +-1.0023357 which is a +-0.85102165 way is a +-0.90955126 education is a +-0.71169424 ethics is a +-0.71169424 burgers is a +-0.71169424 made is a +-0.71169424 Failure is a +-0.71169424 restaurants is a +-0.6005122 smoke is a +-0.71169424 premise is a +-2.0335855 the student a +-1.5864897 job , a +-0.8860731 case , a +-1.0490814 example , a +-1.4342254 Therefore , a +-1.2297088 tuition , a +-1.0716677 Firstly , a +-1.1152519 Often , a +-1.1504664 Secondly , a +-1.3495991 Thirdly , a +-1.1152519 Lastly , a +-1.2297088 Japan , a +-1.454469 First , a +-1.1152519 independence , a +-1.2297088 Second , a +-1.3006831 hand , a +-0.8860731 front , a +-0.8860731 Consequently , a +-0.90565085 fs often a +-1.5734345 is not a +-1.588621 are not a +-0.8980127 If not a +-1.810451 In a +-0.7823678 the cases a +-0.7823678 certain cases a +-1.053841 obtained by a +-0.84880257 entirely by a +-0.84880257 subsidized by a +-0.718338 be in a +-0.8602051 part-time in a +-0.718338 studying in a +-0.3399794 working in a +-0.718338 waiter in a +-1.0982568 students in a +-0.92007583 on in a +-1.0351849 time in a +-0.718338 one in a +-0.718338 resulting in a +-0.6643399 work in a +-0.5186932 well in a +-0.83076644 are in a +-0.718338 whether in a +-0.718338 skills in a +-0.718338 position in a +-0.718338 spend in a +-0.8602051 graduates in a +-0.8602051 period in a +-0.718338 factors in a +-0.718338 someone in a +-0.8602051 participation in a +-0.8602051 Working in a +-0.718338 dishes in a +-0.8602051 point in a +-0.718338 mistakes in a +-0.718338 Being in a +-1.4662038 job and a +-1.7221913 , and a +-1.3409965 school and a +-1.1411275 others and a +-1.1411275 expenses and a +-1.1411275 professional and a +-0.9011515 hours and a +-1.1187395 that of a +-0.8067825 and of a +-0.8067825 less of a +-1.3970195 part of a +-0.41296336 demands of a +-0.9883029 ability of a +-0.9883029 burden of a +-1.383051 cost of a +-0.73467326 importance of a +-1.6377094 value of a +-0.8067825 determination of a +-1.1187395 lack of a +-0.66171676 
purpose of a +-0.9883029 waste of a +-0.8067825 frustration of a +-1.1187395 members of a +-0.8067825 manager of a +-0.8067825 path of a +-0.8067825 pressure of a +-0.8067825 acquisition of a +-0.8067825 minimum of a +-1.1187395 effects of a +-1.5717438 important to a +-0.90442413 keeping to a +-1.0972198 lead to a +-1.1853063 going to a +-1.5366107 go to a +-0.90442413 given to a +-0.90442413 assistant to a +-1.350067 due to a +-0.90442413 supplementary to a +-0.90442413 conducive to a +-1.1468302 opposed to a +-0.7913871 student has a +-0.7913871 individual has a +-0.7913871 air has a +-0.9336654 may experience a +-0.75180066 can gain a +-0.79771787 and gain a +-0.6395066 to gain a +-0.84516895 working from a +-1.0480295 benefit from a +-0.84516895 come from a +-0.84516895 comprehend from a +-1.079625 that working a +-0.8504998 , working a +-1.0111625 by working a +-0.76166815 from working a +-1.0645251 By working a +-0.7733132 ever working a +-0.5868115 part-time as a +-0.686913 job as a +-0.686913 experience as a +-0.5868115 working as a +-0.5868115 you as a +-0.686913 or as a +-0.5868115 themselves as a +-0.5868115 position as a +-0.5868115 times as a +-0.5868115 society as a +-0.5868115 generated as a +-0.5868115 interests as a +-0.5868115 come as a +-0.5868115 Working as a +-0.5868115 serve as a +-0.5868115 days as a +-0.5868115 together as a +-0.5868115 seen as a +-0.5868115 finally as a +-0.5868115 hobbies as a +-1.3226923 gives students a +-1.3226923 give students a +-1.4313142 allows students a +-1.0391068 Such a +-0.6833472 often mean a +-1.1533704 it on a +-1.1533704 job on a +-1.2955194 , on a +-0.42515737 takes on a +-0.8498009 invaluable on a +-0.75526106 is also a +-0.8914721 also also a +-0.9740218 to accomplish a +-1.4891031 to work a +-0.8404194 then work a +-0.8404194 only work a +-0.8404194 To work a +-0.8404194 through work a +-0.8404194 hand work a +-1.0818121 of being a +-0.78838515 from being a +-1.1081296 that if a +-0.86440694 , if a +-0.8267235 desirable if a +-0.667964 having such a +-0.667964 find such a +-0.667964 learn such a +-0.667964 behoove such a +-0.667964 implementing such a +-1.5998961 , then a +-0.8322533 than pursuing a +-0.89810765 ft pay a +-0.8559058 can have a +-0.9151446 that have a +-0.73738813 will have a +-0.493793 not have a +-0.202698 to have a +-1.0427289 students have a +-0.7152325 also have a +-0.6936333 should have a +-0.8559058 only have a +-0.9151446 you have a +-0.96914184 who have a +-0.99588823 ft have a +-0.7152325 fll have a +-0.9156101 gives you a +-0.85122377 of making a +-1.0795275 world following a +-1.4467794 To a +-0.8407426 , become a +-0.509442 to become a +-0.6833472 not guarantee a +-1.2733552 job or a +-1.4723481 , or a +-0.86183906 family or a +-1.074937 company or a +-1.4762473 , even a +-1.3461058 there are a +-1.3131471 jobs are a +-1.3935848 you are a +-1.6553884 There are a +-1.093631 finances are a +-0.5645089 As a +-0.5930753 , As a +-0.86464363 could offer a +-0.9243449 or more a +-0.6341425 of whether a +-0.6341425 decide whether a +-0.9422763 students taking a +-0.775937 since taking a +-0.82342994 met through a +-0.82342994 Working through a +-0.6866503 directly into a +-0.816869 insight into a +-0.6866503 converted into a +-0.6866503 turn into a +-0.704525 be at a +-0.704525 little at a +-0.704525 usually at a +-0.704525 week at a +-0.704525 home at a +-0.704525 look at a +-0.704525 me at a +-0.704525 dealing at a +-0.12762828 Having a +-0.9149432 ft need a +-0.89223915 seem like a +-1.1636549 , doing a 
+-1.1636549 and doing a +-0.9324855 even when a +-0.72627777 to earn a +-0.8949397 to take a +-0.81632745 you take a +-0.8391622 last chance a +-1.1008759 will know a +-0.9243657 appreciate what a +-0.99457824 gives them a +-1.1935583 give them a +-0.99457824 make them a +-0.81090385 giving them a +-0.5101422 can provide a +-0.83762616 to provide a +-0.83762616 also provide a +-0.8637705 If a +-0.48962295 can get a +-0.58585244 will get a +-0.61665446 and get a +-0.5855448 to get a +-0.50472003 should get a +-0.5167513 they get a +-0.50472003 could get a +-0.58585244 them get a +-0.50472003 ultimately get a +-0.50472003 actually get a +-1.4726713 I believe a +-1.0727439 are entering a +-1.04039 , particularly a +-0.91565967 picking up a +-0.9063969 and make a +-1.2048371 while still a +-1.1440232 is quite a +-0.812417 Maintaining a +-0.8228196 of managing a +-0.37295055 at least a +-0.85504335 having been a +-0.8244397 work during a +-0.8244397 lifestyle during a +-0.82537234 , earning a +-0.6929347 and earning a +-0.6418105 to lead a +-0.95612603 may lead a +-1.0817583 to enjoy a +-0.7589079 them enjoy a +-0.8558506 universities once a +-0.6833472 businesses require a +-0.86291385 may force a +-1.1899884 tertiary education a +-0.78997475 in balancing a +-0.38434345 , finding a +-0.38434345 in finding a +-0.38434345 of finding a +-0.6341425 job provides a +-0.6341425 experience provides a +-0.81484497 help improve a +-0.812417 Without a +-0.8569548 should place a +-0.84690464 taking out a +-0.84690464 filling out a +-1.1576419 is where a +-0.7514494 , giving a +-0.8857684 to find a +-0.7899939 should find a +-0.812417 Should a +-0.78997475 It requires a +-0.8986702 adequately support a +-0.81484497 will last a +-0.7514494 start building a +-1.3197943 they learn a +-0.5263734 to pursue a +-0.9049061 less than a +-1.1678841 to enter a +-0.31040567 to secure a +-1.2287464 to meet a +-1.1969007 I worked a +-0.85338986 studying costs a +-0.5620173 for getting a +-0.5620173 , getting a +-0.5620173 to getting a +-0.5620173 just getting a +-0.81484497 should hold a +-0.94065803 few hours a +-0.774833 40 hours a +-0.886487 5 days a +-0.7554748 high risk a +-0.5315993 that adding a +-0.45924574 By adding a +-0.6833472 to securing a +-0.7514494 I studied a +-0.66605204 student obtain a +-0.8845122 to obtain a +-0.7514494 studies -- a +-0.6833472 it instils a +-0.5263734 and opens a +-0.7514494 have yet a +-0.7514494 for planning a +-0.812417 Taking a +-0.7514494 on developing a +-0.7514494 , sometimes a +-0.6833472 fs generations a +-1.0165938 On a +-1.258421 to balance a +-0.22965625 in becoming a +-0.22965625 and becoming a +-0.22965625 to becoming a +-0.22965625 fast becoming a +-1.1678841 to develop a +-1.246687 When a +-0.9068164 to receive a +-1.0924006 I had a +-0.7937406 not had a +-0.78997475 If given a +-0.6833472 few nights a +-0.5930753 work within a +-0.5930753 together within a +-0.6833472 it profit a +-0.45924574 This puts a +-0.45924574 institutes puts a +-0.86464363 made me a +-0.6833472 on seeing a +-0.78997475 little over a +-0.5263734 already experienced a +-0.53794885 in creating a +-0.53794885 even creating a +-0.5263734 to gauge a +-0.5263734 have devised a +-0.5263734 student deserves a +-0.6833472 , purchase a +-0.6833472 to demand a +-0.5263734 , seeking a +-0.6833472 can challenge a +-0.812417 Doing a +-0.45924574 world obtaining a +-0.45924574 without obtaining a +-0.5263734 greatest assets a +-0.6833472 income supporting a +-0.6833472 job creates a +-0.5263734 it yields a 
+-0.6833472 Yet a +-0.5263734 will deter a +-0.5263734 job poses a +-0.45924574 hold down a +-0.45924574 holding down a +-0.5263734 and landing a +-0.5263734 to train a +-0.5263734 to forgo a +-0.5263734 ever known a +-0.5263734 to flip a +-0.5263734 to sustain a +-0.45924574 that holding a +-0.45924574 of holding a +-0.5263734 to supply a +-0.5263734 and hold-down a +-0.5263734 To coin a +-0.5263734 Obliging a +-0.5263734 Implementing a +-0.6833472 does affect a +-0.6833472 like inhaling a +-0.5263734 passively contracting a +-0.95469683 arrange for part-time +-0.95469683 availability for part-time +-0.8965591 had several part-time +-1.2854022 say that part-time +-0.91043675 experiences that part-time +-1.157392 belief that part-time +-0.91043675 doubt that part-time +-0.91043675 aspect that part-time +-0.9394996 students having part-time +-1.1435484 that a part-time +-0.20435277 having a part-time +-1.0257795 is a part-time +-0.66151166 , a part-time +-1.1086023 in a part-time +-1.0791117 and a part-time +-1.2424343 of a part-time +-0.90821856 working a part-time +-1.2293094 if a part-time +-0.7576475 then a part-time +-0.36055943 have a part-time +-1.0233345 or a part-time +-0.7576475 even a part-time +-0.9157078 whether a part-time +-0.28305605 Having a part-time +-0.9157078 doing a part-time +-0.53855455 take a part-time +-1.3084906 If a part-time +-1.2766547 get a part-time +-0.7576475 Maintaining a part-time +-0.7576475 managing a part-time +-0.7576475 where a part-time +-0.7576475 worked a part-time +-0.63068616 getting a part-time +-0.7576475 hold a part-time +-0.7576475 risk a part-time +-0.9157078 adding a part-time +-0.7576475 Taking a part-time +-0.7576475 sometimes a part-time +-0.9157078 had a part-time +-0.7576475 seeking a part-time +-0.7576475 Doing a part-time +-0.9157078 down a part-time +-0.39795253 holding a part-time +-0.7576475 hold-down a part-time +-2.2820241 , the part-time +-2.027672 at the part-time +-1.3858186 Moreover , part-time +-1.8317215 However , part-time +-0.9499941 subject , part-time +-0.9499941 scale , part-time +-0.78769267 Finally , part-time +-2.0259588 are not part-time +-1.5639628 , many part-time +-0.9467805 , any part-time +-0.9467805 from any part-time +-0.77900296 against any part-time +-0.7522525 engage in part-time +-1.7522188 studies and part-time +-1.5293045 experience of part-time +-1.3488836 type of part-time +-0.9360869 balance of part-time +-0.9360869 favor of part-time +-1.4449126 form of part-time +-0.96571416 look to part-time +-0.9614904 select their part-time +-1.2798676 , working part-time +-0.81824607 course working part-time +-1.0920655 students working part-time +-0.81824607 Students working part-time +-0.81824607 currently working part-time +-0.87553394 I work part-time +-1.206789 can work part-time +-1.6209877 to work part-time +-0.87553394 therefore work part-time +-1.206789 days work part-time +-0.77080154 but otherwise part-time +-0.9430868 get only part-time +-1.2058669 that have part-time +-1.584675 not have part-time +-1.0177239 to have part-time +-1.4711833 students have part-time +-0.79304624 should have part-time +-1.6544564 And part-time +-1.1194972 to offer part-time +-1.531082 These part-time +-1.1896515 learned through part-time +-1.2971555 their first part-time +-1.0655015 my first part-time +-2.053652 This part-time +-1.3868866 we do part-time +-1.6261486 to take part-time +-0.6998367 one works part-time +-0.85558045 A part-time +-0.834762 to fill part-time +-1.6502413 their own part-time +-1.595122 to find 
[diff hunk: several thousand added lines of ARPA-style trigram data, one entry per line, each a base-10 log probability followed by its 3-gram, e.g. "-1.1046036 The best part-time" and "-0.015545427 a part-time job"; the underlying corpus appears to be student essays about part-time jobs. The raw data lines, garbled in extraction, are omitted here.]
well and +-0.42256877 the benefits and +-1.5074806 the future and +-0.8312195 carefully consider and +-0.8791983 in academic and +-0.68108815 future ; and +-0.68108815 independence ; and +-0.68108815 expertise ; and +-0.9517022 , skills and +-0.9517022 academic skills and +-0.7823414 increase skills and +-0.7823414 obtain skills and +-0.51808417 team spirit and +-1.2520705 time management and +-1.1196725 go through and +-0.45323932 overall personality and +-0.45323932 inclined personality and +-0.98392856 for themselves and +-0.80389756 manage themselves and +-0.51808417 will expand and +-1.4974552 The first and +-0.8911989 adequate enough and +-1.5271415 to spend and +-0.8663443 buy food and +-1.1186132 is good and +-0.58299506 with others and +-0.7575283 that parents and +-0.9155363 the parents and +-0.66828793 their parents and +-0.5805192 with friends and +-0.500291 new friends and +-0.500291 My friends and +-0.500291 long friends and +-0.7376198 so again and +-0.8848768 from people and +-0.8848768 new people and +-1.6346626 College and +-0.9201956 and when and +-0.6714977 is teaching and +-0.7965408 the theories and +-0.79425 of classes and +-0.79425 my classes and +-0.8502833 all useful and +-0.8961282 fs society and +-0.79888135 working itself and +-0.6837808 their grades and +-0.5843222 good grades and +-1.1232876 club activities and +-1.0413986 the social and +-0.84100085 learn social and +-0.51808417 become unwilling and +-0.9017385 father fs and +-1.1228715 of life and +-0.9693432 social life and +-0.9028055 about life and +-0.9028055 his life and +-0.7486409 everyday life and +-0.6714977 the realities and +-0.84033173 getting up and +-1.1345265 growing up and +-0.79888135 get immediate and +-1.1754589 their personal and +-0.8502833 , finances and +-0.6714977 the levels and +-0.8030546 future debt and +-0.7659851 any income and +-0.7659851 their income and +-0.7376198 people aware and +-0.94073325 financial pressures and +-0.89649045 to living and +-1.017962 living expenses and +-0.8765379 future family and +-0.8269828 become independent and +-0.65755117 more independent and +-0.81564915 negative factor and +-1.3029555 they graduate and +-0.7005967 , professional and +-0.7005967 future professional and +-0.8030546 academic ability and +-0.76990867 with customers and +-0.76990867 non-smoking customers and +-0.8646487 they enjoy and +-0.6714977 is flexible and +-0.8456982 every year and +-0.84361047 of education and +-0.6341634 quality education and +-0.6341634 school education and +-0.98500454 college education and +-0.6341634 my education and +-0.6341634 health education and +-1.4036958 the world and +-1.1378555 academic world and +-1.8525243 the value and +-1.0297275 for learning and +-1.1213851 , learning and +-0.51808417 explore talents and +-0.79888135 new interests and +-1.2249653 a full and +-0.94073325 these factors and +-0.7376198 their energy and +-0.7748844 or assignments and +-0.51808417 job rises and +-0.7965408 work ethic and +-0.51808417 start saving and +-0.8371417 is beneficial and +-0.88646513 precious years and +-0.8871715 , medicine and +-0.51808417 social circles and +-0.8404093 fs meet and +-0.553488 their books and +-0.47770286 needed books and +-0.47770286 purchase books and +-0.51808417 was huge and +-1.4221469 to go and +-0.8030546 adults now and +-0.7077215 different needs and +-0.7077215 basic needs and +-1.1065849 better understand and +-0.7748844 getting higher and +-0.51808417 the momentum and +-0.9805856 study hard and +-0.9805856 very hard and 
+-0.8030546 to concentrate and +-0.6714977 by active and +-0.641719 the government and +-0.6714977 more sophisticated and +-0.77982485 pecuniary insight and +-0.6554139 help universities and +-0.6554139 based universities and +-0.6714977 , productivity and +-0.51808417 practical manner and +-0.51808417 these issues and +-1.1440036 their families and +-0.8404093 in between and +-0.8947326 buy things and +-0.79888135 do basic and +-0.81564915 the tasks and +-0.6714977 and suffering and +-0.9479905 North America and +-0.97635806 , rent and +-0.7376198 customer service and +-0.8312195 for anything and +-0.8030546 , law and +-0.51808417 , unchallenging and +-0.6714977 vicious circle and +-0.8774438 24 hours and +-1.3033627 college days and +-1.261229 a week and +-0.8779626 this day and +-0.33287477 the home and +-0.33287477 from home and +-0.33287477 leave home and +-0.40496916 at home and +-0.33287477 staying home and +-0.33287477 family home and +-0.77152056 teaches responsibility and +-0.77152056 comes responsibility and +-0.7376198 , lunch and +-1.1491418 financial independence and +-0.6714977 big party and +-0.8030546 , drinking and +-0.5843222 social clubs and +-0.5843222 joining clubs and +-1.0359567 for companies and +-0.7965408 their character and +-0.51808417 on taxpayers and +-1.2061355 to obtain and +-0.76625526 to class and +-0.76625526 my class and +-0.64859325 between class and +-0.51808417 work record and +-0.53042823 to mature and +-0.53042823 are mature and +-0.33894214 the discipline and +-0.33894214 student discipline and +-0.33894214 of discipline and +-0.33894214 greater discipline and +-0.7965408 being organized and +-0.45323932 be diligent and +-0.45323932 is diligent and +-0.8108417 broadens interest and +-0.51808417 or dislikes and +-0.7376198 financial planning and +-0.51808417 the rigors and +-0.45323932 , confidence and +-0.45323932 little confidence and +-0.3308039 for guidance and +-0.3308039 providing guidance and +-0.51808417 writing essays and +-0.51808417 good friend and +-1.3291415 this opinion and +-0.6714977 are concentrated and +-0.8871715 a distraction and +-0.51808417 between childhood and +-0.6714977 can influence and +-0.7376198 conscious country and +-0.8371417 , save and +-0.51808417 for authority and +-0.6714977 schools and +-0.51808417 including exploitation and +-0.51808417 and absenteeism and +-0.7965408 the classroom and +-0.8512615 social balance and +-0.3308039 of organizational and +-0.3308039 fs organizational and +-0.51808417 both presently and +-0.51808417 at interviews and +-0.51808417 society professionally and +-0.5843222 Living together and +-0.5843222 gather together and +-0.6714977 , meetings and +-0.7965408 is costly and +-0.7376198 both personally and +-0.51808417 get certified and +-0.6714977 expectations faster and +-0.8536918 their children and +-0.65755117 longer children and +-0.7376198 local community and +-0.51808417 more maturity and +-0.3308039 often wasted and +-0.3308039 greatly wasted and +-0.51808417 very lazy and +-0.5843222 fs growth and +-0.5843222 persons growth and +-0.7376198 always welcome and +-0.51808417 a well-paying and +-0.7376198 bank loan and +-0.51808417 in fashion and +-0.51808417 and technology and +-0.7376198 a low and +-0.51808417 University tuitions and +-0.51808417 the summers and +-0.51808417 before hiring and +-0.6714977 , dedication and +-0.6714977 Both private and +-0.51808417 for housing and +-0.51808417 their wings and +-0.51808417 are unsure and +-0.7965408 a boss and +-0.51808417 up quicker 
and +-0.51808417 latest games and +-0.7376198 critical thought and +-0.6714977 so spoiled and +-0.51808417 , courting and +-0.51808417 life style and +-0.51808417 into adulthood and +-0.51808417 balance expenditure and +-0.51808417 sit quietly and +-0.51808417 our feet and +-0.51808417 first paycheck and +-0.51808417 more cautious and +-0.7928865 potential health and +-0.7928865 personal health and +-0.51808417 to exercising and +-0.51808417 their resumes and +-0.51808417 the dormitory and +-0.6714977 the dorm and +-0.51808417 for awareness and +-0.6714977 a dollar and +-0.51808417 avoid drugs and +-0.51808417 married couples and +-0.51808417 material goods and +-0.6714977 getting smaller and +-0.51808417 poker tournament and +-0.51808417 in physics and +-0.7376198 as partying and +-0.98392856 the restaurants and +-0.98392856 at restaurants and +-0.51808417 do housework and +-0.51808417 both wrong-headed and +-0.51808417 be dispensed and +-0.51808417 is holy and +-0.51808417 relentless racism and +-0.51808417 be compromised and +-0.51808417 greater detail and +-0.51808417 , confident and +-0.6714977 long past and +-0.51808417 the maximum and +-0.6714977 get distracted and +-0.7748844 learned here and +-0.51808417 is compounded and +-0.51808417 , beer and +-0.84427166 introduce smoking and +-0.84427166 segregate smoking and +-1.5482439 to smoke and +-1.013883 the rights and +-0.94073325 public places and +-0.7965408 Restaurants and +-0.51808417 , asthma and +-0.51808417 of appetite and +-0.51808417 the warnings and +-0.51808417 like UK and +-0.51808417 the welfare and +-0.6714977 a poison and +-0.51808417 permeates foods and +-0.7965408 the comfort and +-0.51808417 social pastime and +-1.6222497 may be of +-1.6298187 would be of +-1.6023883 is that of +-1.2646338 to that of +-0.90155673 ways that of +-1.1418319 than that of +-1.5758424 the job of +-0.91268986 simple job of +-0.92800885 grades is of +-0.92800885 burger is of +-1.931039 the student of +-1.2170563 not , of +-1.2170563 But , of +-0.94085634 in many of +-0.77496845 on many of +-0.77496845 since many of +-0.77496845 share many of +-0.9371312 many cases of +-0.7724225 innumerous cases of +-0.8981222 enjoy any of +-0.28392494 that field of +-0.33088547 the field of +-0.28392494 unrelated field of +-0.11853396 their field of +-0.28392494 related field of +-0.28392494 f field of +-0.28392494 -RRB- field of +-0.28392494 's field of +-0.28392494 specialized field of +-0.28392494 current field of +-1.9057608 , and of +-0.8059908 The nature of +-1.0220352 the course of +-0.77956766 chosen course of +-1.4490381 a little of +-0.98221105 that experience of +-0.7316363 the experience of +-0.98221105 useful experience of +-0.98221105 actual experience of +-0.06596292 the amount of +-0.09356618 considerable amount of +-0.06596292 small amount of +-0.09356618 significant amount of +-0.09356618 large amount of +-0.09356618 budgeted amount of +-0.8223625 The stress of +-0.3316378 the top of +-0.21343295 on top of +-1.1039206 is less of +-0.99560815 is one of +-0.81157833 also one of +-0.81157833 : one of +-0.31286377 the quality of +-0.5319525 lower quality of +-1.2556734 the work of +-0.9105461 choices -RRB- of +-1.4685214 And of +-0.83977234 for some of +-0.6486388 use some of +-0.6486388 without some of +-0.6486388 just some of +-0.6486388 practice some of +-0.6486388 lose some of +-1.368325 do so of +-1.149994 , most of +-1.149994 in most of +-0.76830566 spend most of +-1.2851068 a degree of +-1.1922117 Most of +-0.43534195 this level of 
+-0.43534195 a level of +-0.26778597 the level of +-0.9226535 university are of +-0.7779218 have ways of +-0.8437351 potential benefits of +-0.8535203 added benefit of +-1.4332008 a part of +-0.55063707 is part of +-0.89469254 the part of +-0.55063707 being part of +-0.3209676 important part of +-0.55063707 consume part of +-0.55063707 essential part of +-0.55063707 indispensable part of +-0.55063707 indeed part of +-0.89912343 the foundation of +-0.9101094 know more of +-0.8410733 cash potential of +-0.48269886 an understanding of +-0.48269886 's understanding of +-0.21790503 better understanding of +-0.83997875 the management of +-0.44111878 the knowledge of +-0.381484 more knowledge of +-0.381484 my knowledge of +-0.381484 REAL knowledge of +-1.0950018 the environment of +-0.7825354 continual increase of +-1.4842088 the first of +-0.57727504 the issue of +-1.4947908 Many of +-1.2481797 not enough of +-0.9050274 no need of +-0.89184976 the good of +-0.20485121 the idea of +-0.3059415 little idea of +-0.3059415 an idea of +-0.3059415 better idea of +-0.3059415 fs idea of +-0.3059415 clear idea of +-1.5319782 their parents of +-0.8406158 If parents of +-1.156758 the chance of +-0.8592908 best use of +-1.2729907 I could of +-0.7404067 to opportunities of +-1.0728552 for all of +-1.0549766 of all of +-0.7609632 or all of +-0.9204871 spend all of +-0.7609632 complete all of +-1.0391383 college because of +-0.8395745 places because of +-0.3316378 the range of +-0.3316378 broad range of +-0.31286377 their area of +-0.5319525 fs area of +-0.8911092 their subject of +-0.3316378 another source of +-0.3316378 our source of +-1.1290729 in activities of +-0.51976305 is unworthy of +-0.5484505 the demands of +-0.33365712 a lot of +-0.88965493 future life of +-0.88965493 boring life of +-0.6738899 mundane realities of +-0.989523 the responsibilities of +-0.76066804 and responsibilities of +-0.6860101 a result of +-0.58609426 The result of +-0.6738899 the responsibly of +-0.6738899 the reality of +-0.6738899 high levels of +-0.7404067 am aware of +-0.8410733 cover expenses of +-0.51976305 a combination of +-1.2162403 a family of +-0.8059908 critical period of +-0.38230968 for development of +-0.24076203 the development of +-0.38230968 all-round development of +-0.62947315 a way of +-0.5405912 valuable way of +-0.5405912 or way of +-0.5405912 constructive way of +-1.0357691 financially independent of +-0.16369317 the majority of +-0.3316378 large majority of +-0.7640125 the ability of +-0.21298836 many aspects of +-0.21298836 other aspects of +-0.21298836 what aspects of +-0.51976305 to expect of +-0.2468743 a sense of +-0.28955266 better sense of +-0.28955266 growing sense of +-0.28955266 false sense of +-0.21298836 full-time member of +-0.21298836 -RRB- member of +-0.14377046 contributing member of +-0.8486773 their year of +-0.3316378 the type of +-0.21343295 any type of +-0.76456714 the world of +-0.8592908 the success of +-0.5319525 the burden of +-0.5319525 financial burden of +-0.3316378 the concept of +-0.21343295 little concept of +-0.045771595 the cost of +-0.18059649 The cost of +-0.18059649 true cost of +-0.18059649 actual cost of +-0.18059649 full cost of +-0.26207307 the importance of +-0.65955615 The importance of +-0.4941905 and out of +-0.4941905 most out of +-0.4941905 coming out of +-0.4941905 move out of +-0.4941905 fresh out of +-0.4941905 dropped out of +-0.4941905 drop out of +-0.07908349 the value of +-0.5160295 true value of +-0.5995218 real value of +-0.8020919 the interests of 
+-0.6738899 full extent of +-0.8286453 the impact of +-0.51976305 resulting reduction of +-0.51976305 the relevance of +-0.6738899 true determination of +-0.51976305 the particulars of +-0.7404067 all areas of +-1.2846382 Some of +-0.68666714 from those of +-0.68666714 than those of +-0.68666714 behind those of +-0.34126893 final years of +-0.7030647 four years of +-0.59958875 16 years of +-0.59958875 45 years of +-0.31008765 a taste of +-0.20136651 the taste of +-0.31008765 first taste of +-0.83997875 Independent means of +-0.27543268 student outside of +-0.27543268 jobs outside of +-0.27543268 people outside of +-0.27543268 living outside of +-0.27543268 live outside of +-0.27543268 opinions outside of +-0.7779218 full load of +-0.83605236 universities much of +-0.83605236 stem much of +-0.7825354 efficient performance of +-0.51976305 last weeks of +-0.47066337 the costs of +-0.48025998 and costs of +-0.48025998 increasing costs of +-0.7404067 a matter of +-0.22786619 the lack of +-0.22786619 to lack of +-0.22786619 The lack of +-0.22786619 or lack of +-0.7460558 the process of +-0.7779218 whole practice of +-1.1831393 not always of +-0.7779218 severely short of +-1.0263747 the problem of +-0.8223625 immediate effect of +-0.8747623 future workplace of +-0.28955266 working instead of +-0.28955266 earning instead of +-0.28955266 education instead of +-0.28955266 tips instead of +-0.7779218 the concern of +-0.51976305 little bit of +-0.51976305 the confines of +-0.14793482 the rest of +-0.8640698 the lives of +-0.7404067 a sample of +-0.51976305 the delights of +-0.6738899 next round of +-1.4986086 Students of +-0.8020919 the rent of +-1.0687017 the freedom of +-0.33968225 the purpose of +-0.3934904 main purpose of +-0.33968225 sole purpose of +-0.33968225 entire purpose of +-0.65955615 real waste of +-0.65955615 complete waste of +-0.6738899 your circle of +-0.88032144 25 hours of +-1.27016 a week of +-0.8810427 Their day of +-0.27713037 the risk of +-0.45445842 at risk of +-0.6738899 added frustration of +-1.4146034 First of +-0.4030567 the responsibility of +-0.9393304 and responsibility of +-0.3316378 my list of +-0.3316378 aforementioned list of +-0.51976305 personal satisfaction of +-0.8020919 the University of +-0.84641343 a point of +-0.51976305 a backdrop of +-0.51976305 of lots of +-0.6738899 practical applications of +-0.6738899 more background of +-0.14918531 productive members of +-0.14918531 all members of +-0.14918531 adult members of +-0.14918531 Family members of +-0.95198864 the expectations of +-0.51976305 the rules of +-0.51976305 wider array of +-0.7825354 diminished possibilities of +-0.14918531 a kind of +-0.14918531 some kind of +-0.14918531 are kind of +-0.10312651 what kind of +-0.51976305 and dynamics of +-0.6738899 either side of +-0.7404067 a manager of +-0.3316378 to loss of +-0.3316378 causes loss of +-0.51976305 the danger of +-0.7404067 main causes of +-0.51976305 the love of +-0.51976305 the destiny of +-0.6738899 all institutions of +-0.9514858 the right of +-0.5259497 the youth of +-0.45445842 The youth of +-0.6738899 the virtues of +-0.51976305 and abuse of +-0.51976305 gross negligence of +-0.6738899 main purposes of +-0.3316378 lose sight of +-0.3316378 complete sight of +-0.51976305 four corners of +-0.85488534 the balance of +-0.799734 the decision of +-0.51976305 the attainment of +-0.6738899 consist mainly of +-0.799734 the challenges of +-0.51976305 in favor of +-0.16369317 a variety of +-0.3316378 wider variety of +-0.22786619 valuable form of 
+-0.22786619 the form of +-0.22786619 some form of +-0.22786619 other form of +-0.51976305 than capable of +-0.3316378 an appreciation of +-0.3316378 Early appreciation of +-0.51976305 long length of +-0.6738899 the path of +-0.51976305 the content of +-0.7404067 high expense of +-0.91485167 the pressure of +-0.30747035 large percentage of +-0.51976305 take notice of +-0.9811984 primary goal of +-0.21343295 take care of +-0.3316378 taken care of +-0.10312651 a number of +-0.14918531 the number of +-0.14918531 large number of +-0.10312651 total number of +-0.51976305 a dime of +-0.30747035 the requirements of +-0.51976305 the failures of +-0.51976305 and undecided of +-0.30747035 the direction of +-0.7404067 Most kids of +-0.51976305 in possession of +-0.3316378 receiving thousands of +-0.3316378 kills thousands of +-0.51976305 thought-controlling atmosphere of +-0.51976305 recommended norms of +-0.51976305 extended periods of +-0.51976305 repeated bouts of +-0.30747035 One of +-0.6188963 the efforts of +-0.5319525 their efforts of +-0.51976305 obtain control of +-0.51976305 the security of +-0.3316378 encourage thoughts of +-0.3316378 explore thoughts of +-0.51976305 the joys of +-0.51976305 the pain of +-0.51976305 THEIR interpretation of +-0.51976305 all walks of +-0.51976305 the myriad of +-0.3316378 small amounts of +-0.3316378 outrageous amounts of +-0.22941051 in terms of +-0.6738899 the beginning of +-0.51976305 the impression of +-0.51976305 peculiar sort of +-0.51976305 the detriment of +-0.28574196 the health of +-0.55685854 and health of +-0.51976305 new phase of +-0.5319525 social ideas of +-0.5319525 main ideas of +-0.51976305 the remainder of +-0.3316378 the pursuit of +-0.3316378 his pursuit of +-0.51976305 Any forms of +-0.51976305 do dropout of +-0.51976305 the image of +-0.51976305 the acquisition of +-0.51976305 great dollops of +-0.6738899 a minimum of +-0.51976305 obscene perversion of +-0.6738899 academic attention of +-0.51976305 on behalf of +-0.799734 that none of +-0.51976305 the foremost of +-0.51976305 overall efficacy of +-0.30747035 be plenty of +-0.51976305 the entrance of +-0.51976305 to feelings of +-0.6738899 , regardless of +-0.51976305 health concerns of +-0.51976305 worst sufferers of +-0.14918531 the effects of +-0.14918531 health effects of +-0.14918531 ill effects of +-0.14918531 harmful effects of +-0.51976305 makes hundreds of +-0.6738899 the interference of +-0.51976305 and parts of +-0.51976305 restaurants despite of +-0.51976305 means infringement of +-0.51976305 The smell of +-0.799734 the comfort of +-2.3284578 of a nature +-1.6716202 , by nature +-0.947483 its very nature +-2.3051038 The nature +-0.88792825 our true nature +-2.6966972 I completely +-2.0467944 that is completely +-1.2669065 employment in completely +-0.9696175 party and completely +-1.4509012 I am completely +-0.9246814 -LRB- without completely +-2.7458208 part-time job unrelated +-0.86376655 is completely unrelated +-1.2487434 in an unrelated +-0.8768929 more reasons to +-0.97420084 main reasons to +-1.0356766 three reasons to +-0.73031056 excellent reasons to +-0.88755226 leave it to +-1.1177621 do it to +-1.1177621 make it to +-1.7590013 should be to +-1.1677622 would be to +-0.5588796 with having to +-0.8031702 , having to +-0.5588796 by having to +-0.7073471 and having to +-0.40460834 of having to +-0.32447994 without having to +-1.6601943 a job to +-1.9133228 part-time job to +-1.7755494 time job to +-1.3674722 student is to +-1.3682886 , is to +-1.1576488 restaurant is 
to +-1.4218755 This is to +-1.2569262 what is to +-0.85192233 area is to +-1.2569262 life is to +-1.3384877 which is to +-0.85192233 goal is to +-0.88501513 be valuable to +-0.8104003 this student to +-0.76637393 a student to +-1.1891465 the student to +-1.1092662 college student to +-0.8104003 cloistered student to +-0.85291755 after graduation to +-0.9086295 experience , to +-1.6591724 students , to +-1.4943287 studies , to +-1.561148 work , to +-1.1542058 i.e. , to +-0.9086295 earned , to +-0.9086295 partially , to +-1.3619043 second , to +-0.9086295 bed , to +-1.27854 is not to +-1.2191526 , not to +-0.999246 important not to +-0.81395614 people not to +-0.81395614 best not to +-0.81395614 growth not to +-0.81541586 from case to +-0.9365024 worlds in to +-1.5605719 , and to +-1.0882597 themselves and to +-0.6996763 friends and to +-1.0882597 customers and to +-1.0882597 needs and to +-0.8699467 days and to +-1.336546 home and to +-1.0882597 diligent and to +-0.8699467 housing and to +-0.45172024 job unrelated to +-0.45172024 completely unrelated to +-0.8272194 have chosen to +-0.64961874 reason has to +-0.7676003 one has to +-0.7676003 who has to +-0.64961874 world has to +-0.64961874 someone has to +-0.68331456 valuable experience to +-1.043413 and experience to +-0.84226954 hand experience to +-1.4453467 from working to +-0.89786196 only working to +-0.91058034 student as to +-0.91058034 far as to +-0.435649 Another reason to +-0.27475208 for students to +-1.1764132 the students to +-0.41260415 college students to +-0.39066103 helps students to +-1.0331011 help students to +-0.73509085 encourage students to +-0.38775066 all students to +-0.73509085 allowing students to +-0.73509085 encourages students to +-0.8836072 expose students to +-0.73509085 requires students to +-0.6160119 allows students to +-0.73509085 enables students to +-0.73509085 want students to +-0.8836072 assist students to +-0.73509085 pushing students to +-0.73509085 inspire students to +-0.903169 and also to +-0.87127405 much less to +-0.67386734 the time to +-1.0545759 their time to +-0.73068815 ample time to +-0.8774222 less time to +-0.38920784 have time to +-0.73068815 or time to +-0.67605966 more time to +-0.38920784 enough time to +-0.73068815 take time to +-1.0216594 fs time to +-0.73068815 find time to +-0.73068815 excellent time to +-0.44812647 is available to +-0.27405852 time available to +-0.44812647 jobs available to +-0.44812647 Jobs available to +-1.5959704 part-time jobs to +-1.5787758 time jobs to +-0.84952116 offering jobs to +-1.3077376 for one to +-0.1485141 cases unable to +-0.1485141 and unable to +-0.1485141 thus unable to +-0.1485141 are unable to +-1.3844416 to work to +-0.8926488 sufficient work to +-0.8926488 generally work to +-0.13344447 be related to +-0.04255451 job related to +-0.09276951 is related to +-0.13344447 not related to +-0.13344447 work related to +-0.13344447 are related to +-0.13344447 all related to +-0.09276951 directly related to +-0.13344447 responsibility related to +-0.8464918 enough -RRB- to +-1.0501419 etc. 
-RRB- to +-0.40692312 , only to +-0.78654397 money only to +-0.78654397 history only to +-0.721105 is necessary to +-0.76848745 skills necessary to +-0.80291003 can have to +-0.70640576 will have to +-0.85491836 and have to +-0.81340414 students have to +-0.9015334 may have to +-0.80291003 only have to +-0.9726623 they have to +-0.67626476 skills have to +-0.9015334 who have to +-0.5765109 ft have to +-0.67626476 people have to +-0.8834614 would have to +-0.3704176 we have to +-0.67626476 thereby have to +-0.67626476 woman have to +-1.1303846 the money to +-0.84011203 their money to +-0.7037515 little money to +-1.002963 extra money to +-0.8970901 enough money to +-0.9286521 need money to +-1.002963 earn money to +-0.9771276 save money to +-0.7711173 society - to +-0.43292513 will begin to +-0.43292513 students begin to +-0.43292513 likely begin to +-0.84234536 or to +-1.4013506 , or to +-0.84234536 study or to +-0.84234536 tuition or to +-1.1252277 do well to +-0.66852593 may appear to +-1.7120872 students are to +-0.7711173 several ways to +-0.3708232 several benefits to +-0.6773939 many benefits to +-0.93840593 your part to +-0.32976374 In order to +-0.038731158 in order to +-0.8777381 school ; to +-0.4388467 contribute more to +-0.51599467 more attractive to +-0.67953897 employment helps to +-0.67953897 industry helps to +-1.1697705 their skills to +-0.6227832 it important to +-1.0211118 is important to +-1.0651363 very important to +-0.35042372 's important to +-0.8002564 most important to +-0.7327168 are important to +-0.6227832 skills important to +-0.7327168 all important to +-0.004128754 be able to +-0.042674154 is able to +-0.059844032 not able to +-0.059844032 then able to +-0.059844032 only able to +-0.034184407 are able to +-0.059844032 were able to +-0.059844032 was able to +-0.042674154 Being able to +-0.80045015 dedicating themselves to +-0.80045015 exposing themselves to +-1.4591602 the first to +-0.77258974 not enough to +-0.61999524 earn enough to +-0.61999524 fortunate enough to +-0.34933895 old enough to +-1.2412258 the food to +-1.5123683 Having to +-0.5151397 the need to +-0.4452719 will need to +-0.4452719 often need to +-0.4452719 and need to +-0.26824608 students need to +-0.4452719 force need to +-0.4452719 essentially need to +-0.8490858 on others to +-0.8705675 fd like to +-1.030891 the parents to +-1.5126449 their parents to +-1.1605529 young people to +-1.0143895 to see to +-0.81151307 and effort to +-0.66852593 as keeping to +-0.32976374 are willing to +-0.32976374 more willing to +-0.32976374 , trying to +-0.16294363 are trying to +-0.08911633 a chance to +-0.34841016 the chance to +-0.43292513 best chance to +-0.7163103 little use to +-0.7163103 normally use to +-1.3515105 and what to +-0.91064435 them all to +-0.20727918 the opportunity to +-0.45028245 The opportunity to +-0.40964338 an opportunity to +-0.45028245 no opportunity to +-0.45282257 for them to +-0.59915435 encourage them to +-0.59915435 allowing them to +-0.59915435 force them to +-0.13313693 allows them to +-0.59915435 enables them to +-0.7025134 preparing them to +-0.59915435 cause them to +-0.59915435 encouraging them to +-0.59915435 guide them to +-0.87067914 young Japanese to +-0.69958204 very useful to +-0.69958204 extremely useful to +-0.14312786 be expected to +-0.2119523 not expected to +-0.2119523 are expected to +-0.94553316 Learning to +-0.51599467 's external to +-0.86245567 I get to +-0.86245567 They get to +-0.73416126 areas relevant to +-0.81151307 the colleges to +-1.1161627 
club activities to +-0.66852593 are sufficient to +-0.35216606 , how to +-0.3029759 in how to +-0.3029759 students how to +-0.3029759 you how to +-0.3029759 them how to +-0.15201639 learning how to +-0.15201639 learn how to +-0.3029759 understand how to +-0.3029759 taught how to +-1.2948811 a lot to +-1.1705978 school life to +-0.45172024 workforce prior to +-0.45172024 exists prior to +-0.8852968 onto university to +-0.43292513 become used to +-0.43292513 well used to +-0.43292513 are used to +-0.7593176 is up to +-0.7593176 entirely up to +-0.7593176 add up to +-0.17974027 it difficult to +-0.31826448 be difficult to +-0.17974027 is difficult to +-0.31826448 often difficult to +-0.27258074 been difficult to +-0.27258074 sometimes difficult to +-1.505216 to make to +-0.52853036 with regard to +-0.52853036 With regard to +-0.81541586 on loans to +-0.8665724 additional income to +-0.9060262 with which to +-0.9358045 a way to +-0.77151406 best way to +-0.06702199 can lead to +-0.28935957 , lead to +-0.19569385 may lead to +-0.28935957 ultimately lead to +-0.30578586 with regards to +-0.81151307 important factor to +-0.41493955 the ability to +-0.58211726 their ability to +-0.8719748 restaurant customers to +-1.1613337 a sense to +-0.66852593 allowing businesses to +-0.51599467 In contrast to +-0.91301733 from education to +-1.2499849 academic world to +-0.111225836 , going to +-0.16163372 not going to +-0.111225836 and going to +-0.16163372 as going to +-0.19659074 time going to +-0.16163372 : going to +-0.16163372 are going to +-0.16163372 when going to +-0.111225836 while going to +-0.16163372 always going to +-0.16163372 Balancing going to +-0.66852593 training ground to +-0.81151307 herself entirely to +-0.73416126 means '' to +-0.91985255 a place to +-0.6773939 another place to +-0.8306861 paramount importance to +-1.1322751 go out to +-0.8923176 of where to +-0.84514 educational value to +-1.0127025 the transition to +-0.7994119 college campus to +-0.7037232 is learning to +-0.74459434 , learning to +-0.7037232 and learning to +-0.60010767 as learning to +-0.60010767 been learning to +-0.66852593 colleges continue to +-0.79490125 and interests to +-0.66852593 may translate to +-0.66852593 The answer to +-0.66852593 in question to +-0.7764603 not limited to +-0.51599467 Considerations pertaining to +-0.51599467 in relation to +-0.32976374 -LRB- relative to +-0.32976374 important relative to +-0.80637354 In addition to +-0.31379578 in addition to +-0.66852593 they stand to +-0.61471933 and decide to +-0.52853036 they decide to +-0.44526792 will start to +-0.5411668 they start to +-0.44526792 we start to +-0.44526792 smooth start to +-0.71815115 will learn to +-0.6114328 , learn to +-0.6532639 to learn to +-0.76047236 also learn to +-0.6114328 should learn to +-0.76047236 must learn to +-0.6573871 financial lessons to +-0.6573871 those lessons to +-0.73416126 country areas to +-0.73416126 business contacts to +-0.51599467 student intends to +-0.88146377 college than to +-0.6975849 it just to +-0.6975849 income just to +-0.5428381 , possible to +-0.5428381 quite possible to +-0.5428381 always possible to +-0.882481 for years to +-0.8822992 a lesson to +-0.47770196 students come to +-0.47770196 who come to +-0.47770196 would come to +-0.79490125 n't manage to +-0.42986262 it means to +-0.47616065 the means to +-0.47616065 have means to +-1.308102 too much to +-0.496297 can go to +-0.496297 not go to +-0.4222064 to go to +-0.496297 students go to +-0.4201389 student needs to +-0.5442356 
cases needs to +-0.5442356 Japan needs to +-0.7717023 not afford to +-0.6527414 really afford to +-0.51599467 or daughter to +-0.792582 or herself to +-1.0165179 of getting to +-0.7764021 about getting to +-0.58801734 is hard to +-0.58801734 studying hard to +-0.58801734 work hard to +-0.58801734 becomes hard to +-0.58211726 are back to +-0.58211726 giving back to +-0.66852593 necessary power to +-0.51599467 time dedicated to +-1.0922414 the government to +-0.66852593 In comparison to +-0.82269174 younger adults to +-0.32976374 They tend to +-0.32976374 adults tend to +-0.58211726 less likely to +-0.33417666 are likely to +-0.8406946 wealthy families to +-0.35784194 often want to +-0.35784194 students want to +-0.30015707 they want to +-0.35784194 who want to +-0.35784194 ft want to +-0.22774912 might want to +-0.52853036 should try to +-0.52853036 who try to +-0.51599467 previously unavailable to +-1.0014844 it takes to +-0.7711173 much harder to +-0.8394692 the freedom to +-0.7032818 , freedom to +-0.7711173 only serve to +-0.51599467 be encouraged to +-0.73416126 was fortunate to +-0.7764603 have wisely to +-1.2503176 a week to +-0.66852593 and ready to +-0.51599467 is bound to +-0.51599467 be tempting to +-0.51599467 be urged to +-0.51599467 dependent child to +-0.876392 back home to +-0.76853204 social responsibility to +-0.76853204 certainly responsibility to +-0.66852593 all seem to +-0.85648745 from person to +-0.89154303 that adding to +-1.0291091 for companies to +-0.66852593 too easy to +-0.66852593 coined wisdom to +-0.51599467 a network to +-0.51599467 up needing to +-0.66852593 and references to +-1.1704624 an individual to +-0.66852593 eventually wants to +-0.51599467 and attentive to +-0.7764603 other resources to +-0.7711173 ggreen h to +-0.7711173 is needed to +-0.51599467 they plan to +-0.7711173 contributing further to +-0.51599467 mental stability to +-1.0143895 for her to +-0.3085336 it allowed to +-0.26086792 be allowed to +-0.3085336 are allowed to +-0.64097995 a right to +-0.89631003 the right to +-0.64097995 their right to +-0.51599467 feel obliged to +-0.51599467 duties assigned to +-0.7764603 with respect to +-0.51599467 policy measures to +-0.792582 the decision to +-0.51599467 real consequences to +-0.51599467 much closer to +-0.51599467 disadvantage compared to +-0.51599467 award credits to +-1.1692687 their children to +-0.5800439 to choose to +-0.47616065 they choose to +-0.47616065 who choose to +-0.47059247 and had to +-0.47059247 college had to +-0.47059247 they had to +-0.47059247 myself had to +-0.47059247 someone had to +-0.58487517 is required to +-0.58487517 effort required to +-0.51599467 very surprised to +-0.51599467 first came to +-0.66852593 they bring to +-0.7711173 devoted purely to +-0.7711173 is given to +-0.51599467 that relates to +-0.51599467 not suited to +-0.7994119 should look to +-0.79490125 wise choice to +-0.51599467 research assistant to +-0.51599467 scenario leads to +-0.66852593 is merit to +-0.51599467 the capital to +-0.6975849 allow me to +-0.6975849 enabled me to +-0.51599467 various task to +-0.51599467 Looking to +-0.3676195 who goes to +-0.51599467 time adjusting to +-0.79490125 restaurant employees to +-0.51599467 thereby adjust to +-0.66852593 their utmost to +-0.51599467 to prove to +-0.73416126 money left to +-0.73416126 is finished to +-0.66852593 upon ourselves to +-0.73416126 their kids to +-0.4555147 I wish to +-0.1543163 they wish to +-0.3085336 who wish to +-0.51599467 the key to +-0.51599467 , leading to 
+-0.51599467 people accustomed to +-0.51599467 welcome shock to +-0.51599467 habits deleterious to +-0.51599467 obsessive devotion to +-0.51599467 own disciplines to +-0.51599467 from dependence to +-0.45172024 and begins to +-0.27580485 he begins to +-0.51599467 with whom to +-0.66852593 or women to +-0.66852593 not applied to +-0.66852593 it comes to +-0.66852593 have nothing to +-0.51599467 , traveling to +-0.1485141 -LRB- due to +-0.1485141 college due to +-0.1485141 relief due to +-0.1485141 unread due to +-0.51599467 been shown to +-0.51599467 has similarities to +-0.51599467 less apt to +-0.51599467 is supplementary to +-0.51599467 and hopes to +-0.32976374 virtually impossible to +-0.32976374 near impossible to +-0.51599467 This applies to +-0.51599467 are introduced to +-0.66852593 be offered to +-0.2119523 being exposed to +-0.2119523 are exposed to +-0.2119523 when exposed to +-0.73416126 the 40 to +-0.51599467 are thrown to +-0.66852593 are distracting to +-0.51599467 student decides to +-0.51599467 is conducive to +-0.74071324 boils down to +-0.51599467 more attracted to +-0.51599467 and continues to +-0.51599467 have access to +-1.4212176 in restaurants to +-0.8822992 a man to +-0.66852593 undivided attention to +-0.51599467 being asked to +-0.51599467 not pertain to +-0.51599467 is admitted to +-0.51599467 time commuting to +-0.32976374 as opposed to +-0.32976374 am opposed to +-0.51599467 be safe-guarded to +-0.51599467 a hurry to +-0.51599467 robbing Peter to +-0.7711173 opposite approach to +-0.66852593 money aside to +-0.51599467 certainly advisable to +-0.7711173 their rights to +-0.51599467 legally compelled to +-0.51599467 some harm to +-0.66852593 moral justification to +-0.51599467 the campaigns to +-0.51599467 more prone to +-0.32976374 unwanted exposure to +-0.32976374 Regular exposure to +-0.30578586 anywhere close to +-0.51599467 react violently to +-0.51599467 an insult to +-0.21239409 be forced to +-0.32976374 are forced to +-0.871818 in with their +-1.0052555 students with their +-0.93343186 help with their +-0.3878774 interfere with their +-0.7266834 activities with their +-0.7266834 living with their +-0.7266834 responsible with their +-0.7266834 organized with their +-0.7266834 along with their +-0.7266834 spent with their +-0.7266834 interact with their +-0.8343153 study for their +-1.12276 only for their +-1.12276 idea for their +-1.318425 them for their +-0.8343153 used for their +-0.8343153 best for their +-0.8343153 material for their +-0.8343153 concern for their +-1.0308405 things for their +-0.8343153 rewarding for their +-0.8343153 room for their +-1.24111 it that their +-1.7124686 time is their +-1.226466 careers is their +-0.9537364 condition , their +-0.9537364 Otherwise , their +-0.9537364 regardless , their +-0.7253599 of by their +-0.7253599 created by their +-0.7253599 or by their +-0.7253599 determined by their +-0.7253599 quit by their +-0.7253599 funded by their +-1.1893269 job in their +-1.0488584 and in their +-1.1033635 experience in their +-1.1825448 working in their +-1.010111 jobs in their +-0.9870401 them in their +-0.7375863 done in their +-0.8871243 period in their +-0.7375863 stake in their +-0.7375863 importance in their +-0.39148036 learn in their +-0.7375863 change in their +-0.7375863 quickly in their +-0.7375863 day in their +-0.8871243 person in their +-0.8871243 early in their +-0.9511195 interest in their +-0.7375863 confidence in their +-1.0333644 later in their +-0.7375863 distraction in their +-0.7375863 stage in 
their +-0.7375863 excel in their +-0.7375863 behind in their +-0.7375863 marks in their +-0.7375863 advances in their +-1.5383627 job and their +-1.3075469 student and their +-1.3939261 school and their +-1.3075469 well and their +-1.1737754 grades and their +-0.9196386 taxpayers and their +-0.7900746 of their +-1.0851387 that of their +-1.0405678 quality of their +-1.0916538 most of their +-1.0851387 knowledge of their +-0.7900746 use of their +-0.7220074 all of their +-0.7900746 period of their +-0.7900746 independent of their +-1.0405678 aspects of their +-0.42373347 cost of their +-0.82002604 out of their +-0.7900746 reduction of their +-0.7900746 particulars of their +-1.0405678 those of their +-1.1142563 years of their +-1.1348124 outside of their +-0.28777322 rest of their +-0.7900746 expectations of their +-0.7900746 negligence of their +-0.7900746 dime of their +-0.7900746 undecided of their +-0.40799108 direction of their +-0.7900746 detriment of their +-0.7900746 remainder of their +-0.7900746 behalf of their +-1.2071253 it to their +-1.2742101 experience to their +-0.50997126 related to their +-1.2742101 or to their +-1.0977969 more to their +-0.8756907 external to their +-0.8756907 relevant to their +-1.0977969 prior to their +-1.0977969 regards to their +-1.1398947 going to their +-0.8756907 transition to their +-0.8756907 question to their +-0.7110288 addition to their +-1.0977969 back to their +-0.8756907 dedicated to their +-0.8756907 home to their +-0.8756907 relates to their +-1.2742101 due to their +-0.8756907 harm to their +-0.9344526 those studying their +-0.88554215 people from their +-1.0650632 away from their +-0.88554215 learning from their +-0.7364645 exclusively from their +-0.7364645 home from their +-0.7364645 independence from their +-0.7364645 break from their +-0.7364645 distract from their +-0.9519054 studies as their +-1.6034954 young students their +-0.34274846 focus on their +-0.6310951 effectively on their +-0.6310951 more on their +-0.6310951 spend on their +-0.6310951 activities on their +-0.3536326 relying on their +-0.7434528 concentrate on their +-0.6310951 participation on their +-0.7434528 effect on their +-0.6310951 dependent on their +-0.6310951 strictly on their +-0.78844887 rely on their +-0.2600605 concentrating on their +-0.6310951 priority on their +-0.6310951 purely on their +-0.3536326 pressure on their +-0.6310951 difficulties on their +-1.5796918 but also their +-0.95285255 America work their +-1.5468633 Even if their +-0.8452257 have pursuing their +-0.9093543 n't pay their +-2.1311002 to have their +-1.6433324 students have their +-0.914435 always have their +-0.53273916 and select their +-0.9180421 society through their +-0.9285632 input into their +-1.3843775 to spend their +-1.0724907 students spend their +-0.91536397 to put their +-0.8996403 students when their +-1.1385047 better when their +-0.9413828 should do their +-0.42911297 to use their +-0.89070636 They know their +-0.936485 destroying what their +-0.93577343 all after their +-0.8273003 should encourage their +-1.2022369 for all their +-0.8192704 focus all their +-1.0074146 spend all their +-0.8192704 use all their +-1.8714768 If their +-1.4199535 they get their +-1.1215585 them get their +-0.9198701 hand how their +-0.87201214 before entering their +-0.53273916 , neglecting their +-1.675651 to make their +-0.38769767 is managing their +-0.24358147 , managing their +-0.4482546 in managing their +-0.73820513 working during their +-0.6270381 time during their +-0.6270381 
worked during their +-0.6270381 hard during their +-0.8710371 be earning their +-1.444735 to enjoy their +-0.8017274 , balancing their +-1.0198613 will improve their +-0.92660564 test out their +-0.9187945 learning about their +-0.8584982 from either their +-1.1703831 to support their +-0.81577957 fully support their +-0.7621919 must keep their +-0.9222623 ; building their +-0.53273916 to divide their +-0.6751038 not let their +-0.6751038 to let their +-0.8584982 should quit their +-0.69251406 than sacrifice their +-0.8933335 money before their +-0.86987996 graduates enter their +-0.69251406 may hinder their +-0.8584982 diligent until their +-0.53273916 afford maintaining their +-0.907919 though getting their +-0.6751038 will change their +-0.6751038 quickly change their +-0.53273916 whilst furthering their +-0.86987996 balance between their +-0.53273916 to dip their +-0.8621682 will appreciate their +-0.6751038 completely waste their +-0.6751038 than waste their +-1.0547684 to reduce their +-0.7621919 and reach their +-0.7621919 making sure their +-1.0615355 to build their +-0.69251406 of securing their +-0.9222623 to ask their +-0.7621919 experience planning their +-0.69251406 consequently damage their +-0.53273916 and augment their +-0.69251406 -RRB- finish their +-0.8584982 eventually lose their +-1.2939311 to balance their +-0.84814715 can assist their +-0.8355508 , complete their +-0.70041466 to complete their +-0.70041466 students develop their +-0.70041466 also develop their +-0.8017274 and pass their +-0.8017274 count towards their +-0.7621919 and receive their +-0.53273916 to raise their +-0.8017274 are given their +-0.8017274 as using their +-0.82897985 are within their +-0.53273916 even mortgaging their +-0.53273916 local shops their +-0.7621919 upon leaving their +-0.53273916 by assessing their +-0.53273916 to spread their +-0.8017274 , finally their +-0.53273916 Throughout their +-0.53273916 help relieve their +-0.53273916 of funding their +-0.69251406 and beginning their +-0.53273916 to sharpen their +-0.53273916 essentially shape their +-0.9222623 is nonetheless their +-0.53273916 to compromise their +-0.53273916 choices regarding their +-2.8066833 of the chosen +-1.4422953 in their chosen +-1.9751806 of their chosen +-0.931002 about their chosen +-0.931002 beginning their chosen +-2.1864977 they have chosen +-2.4501126 of the course +-2.3479283 to the course +-1.5325897 during the course +-0.44819883 , of course +-0.94075143 And of course +-0.94075143 so of course +-1.3610553 type of course +-0.94075143 always of course +-1.3230749 their chosen course +-0.86294854 about whether course +-0.3781252 Of course +-0.54122734 the core course +-0.7048308 when setting course +-0.84159017 a heavy course +-0.96567583 materials for study +-0.96684366 balancing a study +-0.8986884 and full-time study +-1.6525701 classes , study +-2.0523765 do not study +-1.877816 work and study +-0.9612025 quietly and study +-0.21680178 field of study +-1.1792585 course of study +-0.6122932 area of study +-0.9226842 subject of study +-0.9226842 year of study +-1.4659714 years of study +-0.9226842 hours of study +-1.2964091 be to study +-1.571146 is to study +-0.9150448 chosen to study +-2.0041645 students to study +-0.9150448 less to study +-1.2613357 time to study +-1.1655626 themselves to study +-1.1655626 use to study +-1.6930503 going to study +-0.9150448 areas to study +-0.9150448 encouraged to study +-0.9150448 purely to study +-1.3643526 , their study +-2.0715368 of their study +-1.9694728 to 
their study +-1.6395843 of students study +-0.922965 to less study +-0.9007268 the available study +-1.223827 they only study +-0.9417805 classes or study +-0.9417805 expenses or study +-0.9255283 with academic study +-1.7478527 a good study +-1.4363996 and doing study +-1.2405678 make them study +-1.2047782 people must study +-0.7743107 Anything beyond study +-0.953135 reduces my study +-0.9271722 the hard study +-0.9179611 6 day study +-1.3702492 All study +-0.7022626 and effective study +-0.83807516 , reduces study +-0.5394636 of uninteresting study +-0.7022626 in depth study +-0.5784601 , for example +-0.6068073 For example +-0.96263754 Along with studying +-0.99385834 time for studying +-1.825511 which is studying +-1.248698 she is studying +-2.3491814 a student studying +-1.64252 working , studying +-1.4319112 year , studying +-0.955451 respond by studying +-0.9673256 effectiveness in studying +-1.4206492 college and studying +-1.2539113 diligent and studying +-0.96294063 stress of studying +-1.53831 costs of studying +-1.0870826 focus on studying +-1.2210097 concentration on studying +-1.6895361 their time studying +-1.3270628 extra time studying +-1.5001065 more time studying +-1.9820553 students are studying +-2.131232 they are studying +-0.94139206 either spend studying +-0.6212433 job while studying +-0.70595443 working while studying +-1.2107427 is because studying +-0.76711124 they were studying +-0.76711124 % were studying +-0.8847982 to someone studying +-0.927614 example those studying +-0.92951506 several years studying +-0.89365876 not spent studying +-0.7747171 have finished studying +-0.96997774 medicine and engineering +-0.94992566 student studying engineering +-1.2483399 to an engineering +-0.5415809 advanced mechanical engineering +-0.9627519 living it has +-0.942605 student that has +-0.942605 study that has +-1.6089711 students that has +-0.9625333 Part-time job has +-2.339474 a student has +-2.2043755 job , has +-1.3931006 time , has +-0.9662096 poison and has +-0.8148012 studying engineering has +-1.194812 second reason has +-0.903101 money one has +-0.903101 things one has +-1.9919212 part-time work has +-0.93960273 job market has +-0.9028778 person who has +-0.9028778 Anyone who has +-0.94668555 life which has +-1.8513876 college education has +-1.2167567 adult world has +-0.88401926 , someone has +-0.9145946 and workplace has +-0.90997994 every individual has +-1.1575813 that he has +-0.8386006 a teacher has +-0.8386006 eEm Poker has +-0.7741124 online poker has +-0.5397277 new trend has +-1.3515453 second-hand smoke has +-0.5397277 the air has +-1.8425382 will be very +-1.7742618 not be very +-1.5964961 could be very +-1.7303654 is a very +-1.2559645 provides a very +-1.5247533 it is very +-1.6911908 job is very +-1.3133973 It is very +-0.9264556 graduates is very +-0.9264556 resume is very +-2.5518703 of the very +-0.9648307 meet the very +-2.0866027 are not very +-1.4396361 exposed to very +-0.94634616 engineering has very +-0.9641581 Parents work very +-1.5279132 Students should very +-1.7763419 at college very +-0.9398573 perhaps become very +-0.70341635 generally pays very +-1.1673017 , are very +-1.7168996 students are very +-1.5755847 who are very +-0.91602075 force are very +-1.5065291 It fs very +-0.9062093 is still very +-0.9006796 have worked very +-1.6167034 I was very +-0.8157978 by its very +-0.5402563 be weighed very +-1.5400867 , with little +-0.9200527 jobs with little +-1.3085594 out with little +-1.470797 and having little +-1.3388386 for a 
+-1.5929991 work a little
+-2.2172606 have a little
+-0.7365464 earn a little
+-0.9351971 know a little
+-1.2021183 earning a little
+-1.2678394 be of little
+-0.8854596 has very little
+-0.8854596 pays very little
+-0.9647908 have as little
+-1.8680534 students have little
+-1.5701579 may have little
+-0.95290124 sacrifice what little
[... the added data file continues in this same format, one "<log10 probability> <n-gram>" entry per diff line, for several thousand further trigram entries ...]
+-0.6976776 the least they +-1.1272053 in which they +-0.811429 activities which they +-0.811429 facts which they +-0.34688985 that once they +-0.34688985 relationships once they +-0.34688985 life once they +-0.34688985 begins once they +-0.8740336 a world they +-1.4409702 real world they +-0.6733062 is where they +-0.5759711 and where they +-0.6733062 place where they +-0.5759711 lives where they +-0.5759711 choose where they +-0.9139433 even find they +-0.6976776 or stressful they +-0.8228791 more than they +-0.4862303 , before they +-0.29219642 work before they +-0.4862303 life before they +-0.4862303 lifestyle before they +-0.86608636 h until they +-0.6976776 their classrooms they +-0.9341696 , since they +-0.7271415 wisely since they +-0.6976776 would argue they +-0.23432973 Maybe they +-1.2431045 per week they +-0.7682603 make sure they +-0.53630674 the information they +-0.6976776 social connections they +-1.2869445 When they +-0.6976776 I doubt they +-0.8525835 almost everything they +-1.0325394 educational materials they +-0.6976776 and dedication they +-0.6976776 spend anyway they +-2.388889 , I say +-1.7409415 not to say +-1.8353901 as they say +-1.5847247 To say +-1.2482055 Some people say +-0.9552814 Some would say +-0.99574435 job that you +-1.6597753 is that you +-1.1625066 parents that you +-0.91332567 see that you +-1.3753752 something that you +-1.2438066 this , you +-1.9829057 students , you +-1.8857255 money , you +-1.5172676 second , you +-0.96692485 teaching and you +-0.90563214 And if you +-0.90563214 value if you +-0.89365876 also benefit you +-0.926783 Don ft you +-0.88570225 dorm gives you +-1.5157772 the people you +-0.8741898 eventually see you +-0.8500879 food when you +-0.8500879 help when you +-0.8500879 fs when you +-0.5400801 and practices you +-1.1877905 the classes you +-1.5848917 If you +-0.8422266 do whatever you +-1.5813137 can get you +-0.8847982 jobs teach you +-0.87549525 and once you +-0.8741898 position until you +-0.8393022 by definition you +-0.5400801 Thank you +-0.70315975 like everywhere you +-1.7262436 students that extra +-2.3315494 for the extra +-2.2931683 , the extra +-1.2513081 use the extra +-0.96924686 need of extra +-1.5554733 all their extra +-0.725825 a little extra +-1.5110303 to gain extra +-1.6340674 taking on extra +-2.2842665 The extra +-1.2459265 take an extra +-0.7742088 on some extra +-0.7742088 making some extra +-0.7742088 earn some extra +-0.7742088 provide some extra +-1.4443774 food , making +-1.4395255 interest in making +-1.5654185 purpose of making +-1.910978 , to making +-1.7995653 , then making +-2.367737 they are making +-1.2085187 learn about making +-0.91710436 student with some +-1.5279249 , with some +-0.91710436 are with some +-0.929483 that for some +-1.1916115 but for some +-1.3320012 responsible for some +-0.929483 traumatizing for some +-1.2653023 So , some +-1.5910697 In some +-0.85410535 , In some +-1.0632576 , in some +-1.5310109 and in some +-0.44721836 or in some +-1.2046691 classes in some +-1.2622573 be of some +-1.50516 For some +-1.3715614 , has some +-1.6223023 taking on some +-0.87354934 then making some +-1.1385796 There are some +-1.54278 should spend some +-0.93453825 who need some +-0.88304746 may put some +-0.7020066 started putting some +-1.7353704 to earn some +-0.89884436 into use some +-0.9275736 will provide some +-0.88304746 may teach some +-1.50776 there fs some +-1.6094098 to find some +-0.92005956 career without some +-0.89175254 are just some +-1.2998837 it means some +-1.131519 
into practice some +-1.8376 I think some +-0.53928757 or clean some +-0.89175254 reason why some +-0.7020066 to avoid some +-0.87248117 may lose some +-2.0096276 not be so +-2.4317956 time job so +-2.3096955 it is so +-1.8545883 that is so +-1.5630271 studying is so +-0.9519362 little , so +-1.9356285 students , so +-1.8452226 Firstly , so +-0.9519362 schooling , so +-0.9519362 low , so +-1.8499852 is not so +-1.2212795 were not so +-1.9583555 money and so +-0.95455295 productivity and so +-1.2386113 children and so +-1.4102856 students working so +-1.8072106 is also so +-1.9071451 not have so +-0.94739175 today have so +-1.6771417 And so +-1.621283 has become so +-2.0670385 There are so +-1.5752529 we are so +-0.8839333 They learned so +-1.436986 , doing so +-0.915077 not do so +-1.1866555 to do so +-0.9390374 properly because so +-0.9557189 brought them so +-1.625029 of education so +-0.53955156 with structure so +-0.9284557 40 years so +-1.6088734 I was so +-0.9106502 actually go so +-0.7744382 have fun so +-0.8923161 see why so +-0.8934083 or later so +-0.53955156 and chores so +-0.94295955 banned smoking so +-1.2577947 in with bad +-2.5020983 is a bad +-1.441851 about the bad +-0.964656 is as bad +-1.2350346 not so bad +-1.8002036 A bad +-0.41620955 it taste bad +-0.8872421 they develop bad +-0.541139 it smells bad +-1.0976243 for the following +-1.2530934 offer the following +-0.9620223 classes the following +-2.309394 The following +-1.3885677 in these following +-1.1108903 work world following +-0.8834946 Disney world following +-1.2792873 following reasons : +-1.4840984 two reasons : +-0.96667266 are to : +-0.96667266 limited to : +-2.0182943 such as : +-2.1150353 in college : +-0.8436701 in particular : +-0.87664145 ultimate purpose : +-0.7760808 so fortunate : +-0.7760808 and budget : +-0.8866888 the spending : +-0.895145 gave me : +-0.86209834 sleeping habits : +-0.540874 valid counter-argument : +-1.1778303 reasons : - +-1.7018135 , part - +-0.93796015 @ society - +-0.94074243 hand smoke - +-0.9220978 purpose : To +-0.8184666 : - To +-1.5458702 college will begin +-2.227174 , students begin +-1.1618018 To begin +-0.78292996 - To begin +-1.047435 will likely begin +-0.7771444 the difficulties begin +-0.97055566 with , today +-2.0583153 In today +-0.9617469 up in today +-1.2525551 person in today +-0.45478368 youth of today +-0.902998 , employers today +-0.89609206 around me today +-0.5413157 is happing today +-1.7171768 , it 's +-1.0954514 And it 's +-1.2041106 If it 's +-0.8742827 Because it 's +-0.8742827 however it 's +-0.96781695 course that 's +-1.5502574 a student 's +-1.3850042 the student 's +-1.156677 individual student 's +-0.59976757 of one 's +-0.8014488 to one 's +-0.6751726 pursuing one 's +-0.6751726 cloud one 's +-0.6751726 complement one 's +-0.6751726 complements one 's +-0.8872421 , today 's +-1.2471619 other people 's +-1.6510937 It 's +-0.8120487 That 's +-1.5116677 a person 's +-1.7671072 the job market +-1.2299348 's job market +-0.9382583 current employment market +-1.4250622 one can become +-2.0804126 they will become +-1.8608751 -RRB- , become +-0.9575198 it and become +-1.2443358 mature and become +-0.9575198 wings and become +-2.0845528 students to become +-1.1601818 order to become +-1.1862712 people to become +-1.7332585 them to become +-0.92655444 Japanese to become +-1.709454 how to become +-1.4968944 learning to become +-0.92655444 plan to become +-0.92655444 obliged to become +-0.8218461 wish to become +-0.56800747 it has become +-0.7004492 , 
has become +-0.56800747 work has become +-0.56800747 market has become +-0.56800747 education has become +-0.56800747 Poker has become +-0.56800747 poker has become +-1.6448618 before they become +-1.346302 They become +-0.88694775 students perhaps become +-0.5410506 when workloads become +-2.5231664 is a highly +-1.2672445 society is highly +-1.6449797 has become highly +-1.4012885 I would highly +-0.8186068 become highly competitive +-1.2461556 much more competitive +-1.7333703 time for most +-0.9580987 independence for most +-1.5550297 experience that most +-1.4368156 point is most +-1.0097463 is the most +-2.266187 , the most +-1.4129378 make the most +-1.2441878 Generally , most +-1.775661 First , most +-0.9574435 Australia , most +-0.9574435 Usually , most +-1.5972972 In most +-1.764364 , in most +-0.8378302 and in most +-0.949574 which in most +-1.8887502 , and most +-0.9627949 first and most +-0.96319383 concern of most +-0.96319383 atmosphere of most +-1.2608702 is their most +-1.8261944 The most +-1.549311 should spend most +-0.88579214 , entering most +-0.9476098 f which most +-1.6555415 they would most +-1.2255617 will learn most +-1.450238 So most +-1.1374357 Perhaps most +-0.8157978 at its most +-1.4909871 of a degree +-1.3872102 students a degree +-1.5034742 earn a degree +-1.5034742 getting a degree +-0.9680855 receive their degree +-2.2582183 a college degree +-0.96443766 diploma or degree +-0.8772552 a business degree +-0.54140407 a four-year degree +-1.5555109 as it does +-0.89609206 college degree does +-0.88761127 other benefits does +-0.84176666 The alternative does +-0.95307374 for what does +-0.5413157 skills exactly does +-1.2114555 breathing smoke does +-1.2609444 does not guarantee +-1.197714 with no guarantee +-0.9553819 of it or +-1.8259062 a job or +-2.0945776 part-time job or +-1.630125 time job or +-1.4717526 after graduation or +-1.0830758 part-time , or +-1.0830758 restaurant , or +-1.3514886 studies , or +-1.2957865 school , or +-1.3985196 work , or +-1.391331 -RRB- , or +-0.8668035 pay , or +-1.4474884 money , or +-0.8668035 some , or +-1.1882803 food , or +-1.2522956 activities , or +-0.8668035 ability , or +-1.3527229 education , or +-1.0830758 come , or +-0.8668035 rest , or +-0.8668035 service , or +-0.8668035 day , or +-0.8668035 studied , or +-0.8668035 us , or +-1.0830758 restaurants , or +-0.8668035 over-exertion , or +-0.8668035 depression , or +-0.8668035 patrons , or +-0.8668035 diners , or +-1.9772217 of study or +-1.2085934 , studying or +-1.2898896 with little or +-0.9518974 hard working or +-1.5386825 a restaurant or +-2.0298295 their studies or +-1.8972069 their time or +-0.9565012 either work or +-0.8613693 Reducing -LRB- or +-0.8613693 rights -LRB- or +-0.9468388 employees may or +-1.142789 pay tuition or +-1.2451941 with college or +-0.9262198 doing well or +-0.53517234 the newspapers or +-0.8513167 to whether or +-0.80625856 corporate organization or +-2.0241475 is important or +-1.307558 fs important or +-1.2407398 are learned or +-0.90089345 fast food or +-0.78134453 by parents or +-0.9637829 their parents or +-0.78134453 like parents or +-1.0561433 their major or +-0.9177739 attending classes or +-0.6960337 different fields or +-0.8513167 or club or +-0.9205035 a university or +-0.8740175 personal expenses or +-0.53517234 , grants or +-1.2880837 a family or +-0.94195 much education or +-0.931 mention learning or +-0.53517234 becomes time-consuming or +-0.6960337 by organizations or +-0.53517234 their son or +-0.4128095 dedicate him 
or +-1.158045 the government or +-0.80625856 failing courses or +-0.53517234 of book or +-0.83210784 for rent or +-0.53517234 Cleaning toilets or +-0.8333507 international law or +-0.904779 five days or +-1.1723751 doing homework or +-0.53517234 late teens or +-1.0698128 a positive or +-0.55258316 a class or +-0.6442354 to class or +-0.55258316 swimming class or +-0.55258316 miss class or +-0.46558684 a company or +-0.46558684 bad company or +-0.6960337 good references or +-0.62935984 If he or +-0.62935984 since he or +-0.62935984 principles he or +-0.53517234 he likes or +-0.6960337 -LRB- finance or +-0.53517234 graphic design or +-0.53517234 , sports or +-0.53517234 do sooner or +-1.0571599 to his or +-0.85086954 when his or +-0.53517234 any delay or +-0.6960337 their schools or +-0.53517234 to partly or +-0.6960337 a lab or +-0.53517234 their house or +-0.83210784 fellow employees or +-0.53517234 at hotels or +-0.8295817 Whether or +-0.53517234 young men or +-0.33921602 lab experiments or +-0.33921602 chemistry experiments or +-0.6960337 as bars or +-0.8295817 a diploma or +-0.76632696 young man or +-0.53517234 convenience stores or +-0.53517234 their sons or +-0.83210784 people eat or +-0.6960337 with worries or +-0.53517234 working 30 or +-1.8514657 job that pays +-0.8777878 work generally pays +-0.96308273 productive with well +-1.4116884 do this well +-1.256484 do it well +-2.2013578 can be well +-1.5584325 becoming a well +-2.002797 This is well +-1.5189523 some cases well +-0.88713115 job as well +-0.6211858 , as well +-0.7375912 study as well +-0.88713115 jobs as well +-0.88713115 money as well +-0.7375912 environment as well +-0.7375912 people as well +-0.7375912 finances as well +-0.7375912 beneficial as well +-0.7375912 socially as well +-0.7375912 smokers as well +-1.4953535 do so well +-0.7035448 that pays well +-1.2650701 As well +-0.9352755 not doing well +-0.9163802 To do well +-0.9163802 would do well +-0.7035448 what works well +-0.9459612 graduates could well +-0.8398291 to function well +-1.7896627 A well +-0.8609898 perform tasks well +-2.8185043 of the advertisements +-0.968682 advertisements that appear +-1.4073776 this may appear +-2.548219 in the newspapers +-0.9655893 or on on-line +-0.5417812 on on-line portals +-2.5504236 , the preferred +-1.37353 is much preferred +-1.0783447 are pursuing candidates +-0.7055833 the preferred candidates +-2.0346606 job , even +-1.9431272 students , even +-1.8087926 work , even +-0.952813 And , even +-0.952813 addicted , even +-1.8278681 may not even +-1.5189523 some cases even +-1.6616933 home and even +-0.95612115 authority and even +-0.95612115 couples and even +-0.96822083 performance of even +-1.4399799 exposed to even +-0.9411098 boss -LRB- even +-1.5118992 students may even +-1.1700215 They may even +-1.2593478 people they even +-1.1363616 , or even +-0.9169508 courses or even +-1.1689619 experiments or even +-0.7035448 preferred candidates even +-1.7192471 to spend even +-0.94355804 take up even +-1.5143236 But even +-0.8853834 or perhaps even +-1.5329953 they want even +-1.0143803 to fulfill even +-0.7035448 and possibly even +-0.96894807 even for entry +-0.958831 distract this level +-1.5648235 into a level +-2.4274762 , the level +-2.4897091 to the level +-1.4000356 the necessary level +-0.54140407 for entry level +-0.9438248 and energy level +-0.93024135 their part-time positions +-0.93024135 fill part-time positions +-1.2282406 in such positions +-0.86343884 entry level positions +-0.92180777 into high positions 
+-0.81796503 manual labor positions +-0.7050885 and treasurer positions +-0.88899493 The reasons are +-1.3083148 main reasons are +-1.3491485 experience that are +-1.4119134 jobs that are +-0.8942429 industry that are +-0.7989791 skills that are +-1.2479416 life that are +-0.8942429 Students that are +-2.0136216 a job are +-2.1437654 time job are +-2.0229115 college student are +-1.533756 study , are +-2.0731533 time , are +-1.9642367 , and are +-1.1979706 personality and are +-1.5965812 parents and are +-0.93294895 anything and are +-1.1979706 confidence and are +-0.62328714 , there are +-0.9554113 and there are +-0.78484935 Perhaps there are +-1.1860833 that students are +-1.2586524 the students are +-1.0671086 , students are +-1.1274846 many students are +-0.9363593 and students are +-1.0499752 to students are +-0.7718941 reason students are +-0.7718941 time students are +-1.4367206 college students are +-0.9363593 Most students are +-0.9363593 ; students are +-0.86588967 College students are +-0.7718941 when students are +-0.7718941 recently-graduated students are +-1.0767567 young students are +-1.454559 part-time jobs are +-1.3469226 time jobs are +-0.8805959 f jobs are +-0.69729763 tasks effectively are +-1.895765 , but are +-2.1492019 a college are +-1.9165279 in college are +-1.1244892 that they are +-0.7727382 , they are +-0.986045 and they are +-0.70653486 course they are +-0.7659429 as they are +-0.70653486 school they are +-0.4482995 if they are +-0.9677545 money they are +-0.9014421 skills they are +-0.45467296 when they are +-0.21318287 while they are +-0.70653486 theories they are +-0.9333339 once they are +-0.70653486 find they are +-0.6569747 before they are +-0.70653486 week they are +-0.65922344 that you are +-0.8293688 when you are +-0.659165 classes you are +-0.36419225 If you are +-0.659165 definition you are +-0.88486123 The following are +-0.8766697 employers today are +-0.8663598 level positions are +-0.85569656 many skills are +-1.1653135 Such skills are +-1.1653135 these skills are +-1.0727314 These are +-0.90082943 students who are +-0.6268552 candidates who are +-0.35116217 people who are +-0.8393837 those who are +-0.6268552 groups who are +-0.6268552 individuals who are +-0.6268552 patrons who are +-1.1795483 smokers themselves are +-0.24722654 There are +-1.2720731 their parents are +-0.8681273 if parents are +-0.9263121 are doing are +-0.91777855 as people are +-0.91777855 less people are +-0.80934 They are +-1.1210108 People are +-0.9194754 such classes are +-0.83130157 Workers are +-0.53604466 Evaluations are +-0.31465554 Internships are +-0.92764527 These activities are +-0.9066931 balancing responsibilities are +-1.1782702 at university are +-0.72722673 if finances are +-0.72722673 unless finances are +-0.8848671 internships which are +-0.8848671 paths which are +-0.8759231 these expenses are +-1.3561639 , these are +-0.9109254 If customers are +-1.4479908 work force are +-0.53604466 College campuses are +-0.53604466 and discovery are +-0.98998374 these factors are +-1.3730476 Some are +-0.93041277 work offers are +-1.0016955 work ethics are +-1.1051209 their lives are +-0.92039794 our lives are +-0.927581 These things are +-1.7078087 in Japan are +-0.908654 of hours are +-0.90661454 Those days are +-0.53604466 they never-the-less are +-0.8759231 card companies are +-0.69729763 and reports are +-1.0078307 that we are +-1.0757267 , we are +-0.7713833 reason we are +-0.402268 if we are +-0.69729763 again classmates are +-0.86719453 The children are 
+-0.53604466 of shoppers are +-0.53604466 point averages are +-0.8078884 The weekends are +-0.53604466 for superiors are +-0.53604466 being rejected are +-0.53604466 dollar signs are +-1.3655716 ban smoking are +-1.1924592 who smoke are +-0.83130157 Restaurants are +-0.53604466 , unknowingly are +-0.9710474 two , As +-1.1455239 are several ways +-2.1841552 they have ways +-1.3885677 in these ways +-0.94315493 far out ways +-1.9617627 for a career +-0.9498522 pursue a career +-0.9498522 planning a career +-1.2296149 creating a career +-0.9691185 array of career +-2.2879853 in their career +-0.6849178 the chosen career +-0.91616917 their chosen career +-1.7755643 that are career +-0.85785913 her future career +-0.85785913 desired future career +-0.8624363 a different career +-0.40376922 a successful career +-1.2420009 to my career +-0.9427098 those whose career +-0.5409623 fs targeted career +-0.9307643 are career oriented +-0.9582222 supplemented by internships +-1.565553 form of internships +-2.0306695 such as internships +-0.89666134 not offer internships +-2.4436839 in a practical +-0.9404519 them valuable practical +-0.9707589 internships , practical +-0.877244 student see practical +-1.2339287 to develop practical +-2.2924817 The experiences +-0.8441873 , practical experiences +-1.5643188 These experiences +-0.862839 the educational experiences +-0.9461372 unstructured social experiences +-0.9424741 great learning experiences +-0.8441873 with invaluable experiences +-1.1412889 Those experiences +-1.2616045 Another option +-0.96521807 available job offer +-2.0894058 do not offer +-0.9692575 interests and offer +-1.6463622 has to offer +-0.96713555 utmost to offer +-1.6197623 Part-time jobs offer +-0.9569107 student may offer +-2.0279055 They offer +-0.94775486 experience could offer +-0.75314444 be several benefits +-0.75314444 offer several benefits +-0.9665229 conclusion the benefits +-1.5519412 up the benefits +-1.7147784 are many benefits +-0.8873397 The potential benefits +-0.9335651 what other benefits +-0.8441873 show immediate benefits +-1.4046149 Some benefits +-0.7048308 The principle benefits +-1.5927669 that will benefit +-1.1739321 work will benefit +-0.9197259 Others will benefit +-1.3807653 , has benefit +-1.2443275 could also benefit +-1.3518015 the financial benefit +-0.8177979 hold another benefit +-0.89609206 any professional benefit +-1.2857119 the added benefit +-1.0720369 be a part +-1.7447491 having a part +-1.521263 , a part +-1.145143 not a part +-1.0488638 cases a part +-1.475688 in a part +-1.4908571 of a part +-1.145143 has a part +-1.202685 from a part +-0.34968758 working a part +-1.2963437 on a part +-0.83470935 work a part +-0.65079004 have a part +-0.840756 Having a part +-0.84569174 need a part +-1.0488638 doing a part +-0.8755362 get a part +-1.0488638 believe a part +-1.1868877 find a part +-1.9464405 job is part +-2.0274973 and the part +-1.500505 to the part +-2.0695806 on the part +-1.9539262 at the part +-1.2234309 this , part +-1.2234309 often , part +-1.3766057 Moreover , part +-1.2234309 Additionally , part +-1.808915 Firstly , part +-1.5559587 Thirdly , part +-2.357961 , and part +-0.9601441 summers and part +-1.26125 range of part +-2.101918 in their part +-2.101918 to their part +-1.2209332 were studying part +-1.3786697 , working part +-1.2244382 and working part +-0.58397394 By working part +-0.85511935 believe working part +-1.5501357 university students part +-1.1816928 that work part +-1.8768463 to work part +-0.92403084 they work part 
+-1.1727594 by being part +-1.6208706 to have part +-1.5717242 students have part +-1.077507 should have part +-1.3899235 who have part +-1.5578457 , even part +-0.98059773 an important part +-1.1939646 learned through part +-1.5184757 work at part +-0.9323283 student doing part +-0.91625166 on your part +-1.1899354 and take part +-1.811182 to get part +-0.7772107 A part +-0.80078924 c A part +-0.9451895 Sometimes these part +-1.0400496 a particular part +-0.53893584 could consume part +-1.3904204 Working part +-0.53893584 and pointless part +-1.0400496 an essential part +-0.53893584 the usual part +-1.17519 students had part +-0.53893584 an indispensable part +-0.937617 is indeed part +-0.70149505 for distracting part +-0.8370262 Any part +-1.2079217 time employment lays +-0.96742356 lays the foundation +-0.96742356 lay the foundation +-1.3052491 a real foundation +-0.970309 physics and history +-2.0756483 the work history +-0.9590756 useful for future +-0.9590756 contacts for future +-1.4461963 for the future +-1.4254631 in the future +-2.2278903 to the future +-0.95530033 maturing the future +-1.7029536 with their future +-1.6971843 for their future +-1.3241117 of their future +-0.92981315 shape their future +-0.9425815 chosen -LRB- future +-1.1727114 to my future +-0.9190456 jeopardize my future +-1.0750954 for his future +-1.0750954 to his future +-1.0984732 for her future +-1.0162231 the entire future +-0.9427098 their desired future +-0.5409623 a brighter future +-0.5409623 can tolerate future +-0.94767666 weighed very carefully +-2.283659 students should carefully +-1.4617293 By carefully +-0.77729654 should weigh carefully +-1.2588104 employers will consider +-2.0027337 have to consider +-0.95223993 well to consider +-1.8398224 important to consider +-0.95223993 factor to consider +-1.2341734 impossible to consider +-0.8181321 should carefully consider +-0.94057494 we must consider +-0.8773025 that universities consider +-0.97038597 consider and select +-2.0678182 In order +-1.3971903 , in order +-0.86438 increased in order +-0.86438 accomplish in order +-1.289309 jobs in order +-1.431872 work in order +-0.86438 money in order +-1.0790951 positions in order +-0.86438 ways in order +-0.86438 activities in order +-0.86438 year in order +-0.86438 essential in order +-1.5322751 a valuable skill +-0.9536431 perfected one skill +-0.96616054 valuable work skill +-0.9580523 essential life skill +-0.8884857 f By +-0.8186782 By carefully selecting +-2.5528169 , the gaps +-0.96412337 relationship with academic +-2.0333517 with the academic +-2.1301122 in the academic +-2.0043886 and the academic +-1.8654889 from the academic +-1.385065 outside the academic +-1.265775 spent in academic +-0.96988857 deleterious to academic +-1.6242808 with their academic +-1.5046768 and their academic +-1.9538615 on their academic +-0.91550964 sacrifice their academic +-0.91550964 build their academic +-0.9643865 important as academic +-1.4263175 focused on academic +-1.8247619 not only academic +-0.9328868 improve our academic +-0.9392128 focuses his academic +-0.8171302 just purely academic +-1.5664853 level of qualification +-1.5318627 the academic qualification +-2.2551448 can be minimized +-1.6296837 or not ; +-1.2263513 the studies ; +-1.8452317 in school ; +-1.3925898 of money ; +-1.6804159 the future ; +-0.54052097 be minimized ; +-1.3487473 in society ; +-0.92405874 a living ; +-0.816297 separate arguments ; +-0.95013785 higher education ; +-1.7631538 real world ; +-1.1821191 on campus ; +-0.7038017 
can suffer ; +-1.1294426 and independence ; +-0.7754742 individual basis ; +-0.54052097 desired expertise ; +-2.3301516 , it makes +-2.414362 the student makes +-0.9647491 year and makes +-0.9647491 foods and makes +-0.9585812 also experience makes +-1.1818957 smoky environment makes +-2.1357827 This makes +-2.0887165 It makes +-0.54122734 This indirectly makes +-1.2703568 makes the candidate +-0.86392206 a poor candidate +-1.7599955 to be more +-1.5808392 also be more +-1.4958079 could be more +-1.6259205 would be more +-1.45789 and having more +-1.2499466 lead a more +-1.5288647 becoming a more +-1.5547073 studying is more +-0.9462861 change is more +-0.9462861 employee is more +-0.96515733 entrance of more +-1.6034087 a little more +-1.53299 free time more +-0.89254904 ft agree more +-1.8543806 will have more +-1.9313538 they have more +-1.2433991 their money more +-1.6064974 has become more +-0.9571094 days or more +-1.1973464 and even more +-0.8711045 up even more +-1.6957831 students are more +-1.5585768 who are more +-0.9122387 People are more +-0.9122387 smoke are more +-0.8895055 and offer more +-0.7004738 the candidate more +-0.7803752 not spend more +-1.1709386 to spend more +-0.9488013 students spend more +-0.8825364 cards gives more +-1.1863052 to other more +-0.9270141 can take more +-1.1430278 will know more +-0.8997754 educational opportunity more +-0.8811952 has done more +-0.9493133 What fs more +-1.3803583 a lot more +-1.5903656 college life more +-0.8899329 my finances more +-0.857178 still provides more +-0.37896648 to contribute more +-0.83887684 from campus more +-1.2942199 so much more +-0.8710827 become much more +-0.91694707 education was more +-1.0380341 should concentrate more +-1.495875 But more +-0.8565836 are far more +-0.91354114 you want more +-0.880724 was made more +-0.8127947 help concentration more +-0.53823316 to assure more +-0.7004738 can bring more +-0.8811952 and spending more +-0.7004738 and seeing more +-0.95854473 candidate more attractive +-2.0586054 to a potential +-2.4583068 of the potential +-2.3548265 to the potential +-1.5337261 consider the potential +-0.9680855 compromise their potential +-2.3008554 The potential +-0.81796503 the cash potential +-0.91423696 their full potential +-0.9676906 factor that employers +-2.7464805 of the employers +-0.970251 experiences , employers +-1.5938838 , many employers +-1.6429511 ban on employers +-0.9461294 2 -RRB- employers +-1.5805553 , so employers +-1.2316408 the potential employers +-1.6931171 Many employers +-0.5410506 Potential employers +-1.7106184 and it helps +-2.8487937 a part-time helps +-2.0092807 part-time job helps +-2.242168 time job helps +-1.206362 Part-time employment helps +-1.4066516 job also helps +-0.8443649 hotel industry helps +-0.70495963 Practical background helps +-2.0313077 would be better +-2.0486953 having a better +-1.9477454 in a better +-1.439628 gain a better +-2.2116642 have a better +-0.7361337 them a better +-0.9344425 instils a better +-2.166562 It is better +-0.9689299 job the better +-2.3934903 the student better +-1.4757301 , will better +-1.0271429 students will better +-0.8431235 I developed better +-1.5385073 to do better +-1.1809194 students do better +-0.8632767 general do better +-1.7943159 A better +-0.78891027 be much better +-1.038497 is much better +-0.78891027 environment much better +-0.86192465 bridging effect better +-0.86172855 is far better +-0.8437642 I focused better +-0.84053266 I got better +-0.96471906 tie-in with understanding +-1.2654307 
foundation for understanding +-1.8237026 well as understanding +-0.95896196 them an understanding +-1.6576517 one 's understanding +-0.9642324 learning or understanding +-0.67678577 a better understanding +-0.9679412 employment , whether +-1.5574268 study , whether +-0.96988934 decision of whether +-1.2699976 as to whether +-1.2092912 worrying about whether +-1.0057982 and decide whether +-2.4549673 in a corporate +-2.797555 of the organization +-0.5415809 a corporate organization +-1.7085686 the same organization +-0.77729654 a service organization +-2.2403214 as a hotel +-0.9652262 organization or hotel +-1.7512143 in the industry +-2.5041761 to the industry +-0.9650539 fields or industry +-0.7054753 or hotel industry +-1.2707573 helps to evolve +-0.9604183 me with skills +-0.9343235 builds valuable skills +-2.5759876 of the skills +-0.9648221 knowledge , skills +-0.9648221 Well , skills +-1.4937348 too many skills +-1.2380977 put their skills +-0.95428586 sharpen their skills +-0.9508152 effective study skills +-0.8236025 Such skills +-2.1895845 The skills +-1.2518615 These are skills +-0.8392449 develop practical skills +-1.1927592 their academic skills +-0.8255273 our academic skills +-0.53876007 to evolve skills +-1.5402018 These skills +-0.3409608 the leadership skills +-0.3409608 include leadership skills +-0.37923184 and interpersonal skills +-1.339161 time management skills +-0.81367016 and increase skills +-1.2812432 the people skills +-1.1024694 and people skills +-0.8784866 Those people skills +-0.9480985 -LRB- what skills +-0.7668956 and social skills +-0.7198294 building social skills +-0.7198294 stronger social skills +-0.7198294 communicative social skills +-0.9535544 and life skills +-1.2785089 their personal skills +-0.8209372 improve these skills +-0.8209372 practice these skills +-0.8209372 All these skills +-0.53876007 and transferable skills +-0.53876007 fre countless skills +-0.53876007 readily apparent skills +-1.0412586 to finding skills +-0.9452047 students learn skills +-0.9126212 of workplace skills +-0.8392449 mastering basic skills +-1.3093265 to obtain skills +-0.53876007 which managerial skills +-1.0386164 are invaluable skills +-0.53876007 and employability skills +-0.53876007 and time-management skills +-0.962609 deem it important +-0.4996372 it is important +-1.0963347 job is important +-1.4032688 studying is important +-1.2694736 It is important +-0.9037935 Suffering is important +-0.9037935 timing is important +-0.9037935 eIt is important +-1.2794907 is not important +-0.966031 beneficial and important +-0.44223198 is very important +-0.7271976 college very important +-1.0157851 are very important +-0.7271976 fs very important +-0.96237063 almost as important +-1.8083255 is also important +-0.51687723 is an important +-1.437981 as an important +-0.87807935 now an important +-1.804116 not only important +-0.62290335 is so important +-0.8284021 it 's important +-0.79777956 is most important +-0.72786295 the most important +-0.9747011 The most important +-1.6892259 that are important +-0.9514374 things are important +-0.80976206 Another important +-0.61039853 is more important +-0.91796637 become more important +-0.9539171 evolve skills important +-0.84191966 are all important +-0.8838247 them teach important +-0.6830955 it fs important +-0.9467313 teach these important +-1.8778279 to learn important +-0.77396137 is extreme important +-0.5396396 not necessarily important +-0.77396137 is personally important +-0.93938166 is indeed important +-0.5396396 fs 
terribly important +-2.5457652 , the employer +-2.313727 The employer +-0.8883211 a potential employer +-1.2445872 skills may include +-0.93952686 Said activities include +-0.8184666 These factors include +-1.2703568 them the leadership +-0.7775762 may include leadership +-1.7434075 skills , commitment +-1.4434808 terms of commitment +-2.2403214 as a team +-1.270913 commitment , team +-0.70563835 , team spirit +-1.2677768 training in interpersonal +-0.9657898 spirit and interpersonal +-0.9657898 service and interpersonal +-2.540267 , the management +-0.87694967 , time management +-1.5480571 of time management +-0.90610564 to time management +-1.3789517 fs time management +-0.937816 personal financial management +-0.87756544 studying business management +-2.1401095 college student taking +-0.92122644 tuition by taking +-0.92122644 simply by taking +-0.96439314 management and taking +-1.2577398 themselves and taking +-1.5625224 costs of taking +-1.2526901 refrain from taking +-1.6749262 many students taking +-1.6015491 of students taking +-1.779791 you are taking +-0.9130451 truly enjoy taking +-0.89552355 tried just taking +-0.89552355 graduate since taking +-1.1723425 and taking criticism +-0.5417812 taking criticism positively +-0.966677 Besides the knowledge +-1.4358697 which the knowledge +-1.2442768 have more knowledge +-1.5166019 on my knowledge +-0.5413157 themselves acquiring knowledge +-0.8443649 gaining invaluable knowledge +-0.5413157 any REAL knowledge +-1.6165302 the experience gained +-0.95710313 time-management skills gained +-1.1193186 the knowledge gained +-1.4983077 after graduation through +-2.7804086 part time through +-0.7753227 knowledge gained through +-0.9320446 support themselves through +-0.59713745 are learned through +-0.8428676 money earned through +-0.95721966 support them through +-1.2033707 the society through +-0.54043275 are met through +-0.8040625 their way through +-0.8040625 my way through +-0.44136035 must go through +-1.3996488 Working through +-0.9195662 education right through +-0.8161305 to pass through +-0.7036732 environment goes through +-0.7753227 own kids through +-0.54043275 or daughters through +-1.2691678 through the educational +-1.4283475 fees and educational +-1.4283475 books and educational +-0.9680855 appreciate their educational +-0.9528268 of some educational +-0.8445425 of his\/her educational +-0.8446352 Attending tertiary educational +-2.1060765 a student who +-0.9389484 The student who +-0.96883214 People , who +-1.6444998 for students who +-1.2313132 , students who +-1.2448311 to students who +-0.86371535 from students who +-1.078006 The students who +-1.2197859 fellow students who +-0.86371535 than students who +-0.86371535 poor students who +-1.9906181 part-time jobs who +-1.1893197 of jobs who +-2.1460001 , or who +-0.702775 pursuing candidates who +-0.8997171 on employers who +-1.1244714 with people who +-1.2156218 of people who +-0.83519536 elderly people who +-0.83519536 employ people who +-1.1356956 People who +-0.8736195 the workers who +-0.91844666 on customers who +-0.3074312 of those who +-0.7106808 to those who +-0.605579 help those who +-0.605579 Beware those who +-0.702775 a worker who +-0.8926364 and families who +-1.232578 Students who +-0.8149672 that anyone who +-1.5013487 a person who +-0.8149672 my co-workers who +-1.1356956 Those who +-0.8149672 the groups who +-0.702775 are individuals who +-0.5398157 Anyone who +-0.9398239 restaurant patrons who +-0.81542623 , non-smokers who +-0.9653459 was not productive 
+-0.9680855 through their productive +-1.8633671 to become productive +-1.8810347 who are productive +-1.2446644 a more productive +-1.268888 do something productive +-2.700372 I developed +-2.2438025 can be developed +-0.9377899 with well developed +-1.4047122 people skills developed +-1.3439867 many other developed +-0.97076845 hamstring the overall +-0.84522486 well developed overall +-0.7054753 developed overall personality +-0.91436857 their individual personality +-0.54166937 materialistic inclined personality +-0.60063684 will be able +-0.83661413 not be able +-0.6809103 to be able +-0.9903599 may be able +-1.2472117 should be able +-1.0769796 only be able +-0.7372119 must be able +-0.81015027 n't be able +-1.2679627 would be able +-0.81015027 t be able +-1.937033 student is able +-0.9614467 everyone is able +-2.0280538 is not able +-0.9429684 was then able +-0.95055217 was only able +-1.5890334 and are able +-1.6126069 they are able +-0.90841216 you were able +-1.6266937 I was able +-0.7427083 Being able +-2.4882603 able to assimilate +-1.6058713 money for themselves +-0.95834273 identity for themselves +-1.2567652 earn it themselves +-2.0912766 are not themselves +-0.9561014 live by themselves +-0.87516296 such positions themselves +-0.54043275 to assimilate themselves +-0.87495136 students see themselves +-0.7036732 into keeping themselves +-0.88557863 ft teach themselves +-1.4843823 to support themselves +-1.1389889 to manage themselves +-0.7036732 likely manifest themselves +-0.54043275 can situate themselves +-0.54043275 only hurting themselves +-0.54043275 fully dedicating themselves +-0.54043275 without exposing themselves +-0.607943 be smokers themselves +-0.607943 the smokers themselves +-2.5293126 , and into +-0.9307448 assimilate themselves into +-0.6550727 to put into +-1.1738988 the effort into +-0.81542623 , fit into +-1.1133988 are entering into +-0.859884 until late into +-0.774821 translate directly into +-1.2262087 they enter into +-0.702775 and classrooms into +-0.5501161 and insight into +-0.5501161 added insight into +-0.5398157 about assimilation into +-0.5398157 their toes into +-0.5398157 be converted into +-0.81542623 students mature into +-1.0015004 be taken into +-0.702775 easily turn into +-0.5398157 valuable input into +-0.8149672 carry over into +-0.702775 more women into +-0.5398157 all energies into +-0.8387759 be divided into +-0.5398157 be pressured into +-0.5398157 deeply integrated into +-0.9578491 maintain this environment +-1.7146847 into the environment +-0.9660607 Even the environment +-0.9610377 smoke-free working environment +-0.9530137 his school environment +-1.9698844 the work environment +-0.9497381 her work environment +-0.9260394 choose an environment +-0.9260394 creating an environment +-2.1288831 This environment +-1.0456167 a healthy environment +-0.8410611 a smoke-free environment +-0.38033938 a smoky environment +-0.5409623 a smoke-filled environment +-2.0572052 can be learned +-2.1121125 to be learned +-2.1795077 they have learned +-0.9473085 practices you learned +-1.6168703 that are learned +-1.3519534 skills are learned +-0.93727094 ethics are learned +-2.0311003 They learned +-0.7050885 the theory learned +-2.0506668 for a position +-1.6053771 such a position +-1.2457142 secure a position +-0.9605936 through part-time position +-0.887796 a graduate position +-0.7050885 for each position +-0.77699226 the manager position +-0.54140407 and humble position +-0.96544003 position will expand +-0.96997774 expand and increase 
+-0.9484744 but could increase +-0.8966985 would never increase +-0.5415809 the continual increase +-1.566105 skills and abilities +-2.3120105 in their abilities +-1.6613173 one 's abilities +-0.9576363 try it at +-2.0127695 will be at +-0.95615315 works part-time at +-2.385743 it is at +-0.95421904 mind is at +-1.9985424 are not at +-1.4216887 interested in at +-1.057089 and study at +-1.4531772 to study at +-0.8508254 doing study at +-1.2621901 for example at +-1.5873288 a little at +-0.95070475 students experience at +-1.1230037 for working at +-0.89062965 are working at +-0.89062965 continue working at +-1.7981873 their time at +-0.94447577 my time at +-2.2044094 part-time jobs at +-1.2536495 can work at +-1.1335274 not work at +-1.7184393 to work at +-0.89676166 real work at +-0.9319394 study then at +-2.084779 of money at +-1.5304801 the most at +-0.8859713 business degree at +-0.9353974 government or at +-0.9353974 schools or at +-0.87775147 have learned at +-0.7687077 and abilities at +-0.90377986 on food at +-1.1942582 are good at +-0.6818618 part-time while at +-1.1303331 job while at +-0.6818618 tuition while at +-0.6818618 workplace while at +-0.94956976 life all at +-0.934515 school because at +-1.1776385 also provide at +-0.8531268 are usually at +-0.7687077 are staying at +-0.8531268 up late at +-1.2890396 of customers at +-0.89322567 in success at +-0.90771955 free days at +-1.3690379 a week at +-0.9068736 going home at +-0.86740685 they obtain at +-0.87043726 financial discipline at +-0.5365689 who interns at +-0.80886924 time needed at +-1.0607953 and live at +-0.5365689 be impressed at +-0.88658917 years spent at +-1.0332845 the public at +-1.169414 to look at +-0.8859713 puts me at +-0.8348791 a goal at +-1.4555247 be banned at +-0.8398179 as dealing at +-0.5365689 tour guides at +-0.5365689 family unit at +-0.5365689 a course-load at +-0.6980578 greater population at +-0.5365689 quite shocked at +-0.83233684 to perform at +-0.6980578 left solely at +-0.5365689 be mesmerized at +-0.5365689 of winning at +-0.5365689 Troubles at +-1.1238561 to eat at +-1.4634807 on smoking at +-0.93096304 inhaling smoke at +-1.741608 , this same +-1.9209192 is the same +-1.4147052 in the same +-1.488517 of the same +-0.6336835 at the same +-0.4505812 At the same +-1.7929323 same time benefiting +-2.0477881 not be financially +-0.9692575 professionally and financially +-1.198633 to become financially +-0.9337215 keeping themselves financially +-0.54122734 time benefiting financially +-1.8013906 A financially +-1.2081527 Being financially +-0.54122734 to succeed financially +-1.7489916 statement for two +-0.95956486 mainly for two +-1.647099 I have two +-1.4942087 the following two +-0.91353816 There are two +-0.94377345 divided into two +-1.6925907 For these two +-1.0172796 Reason two +-1.546171 when I first +-1.257892 make it first +-2.0267794 for the first +-1.4526637 be the first +-1.823614 is the first +-2.0024896 in the first +-2.0019011 to the first +-1.3550861 are the first +-1.2081964 probably the first +-0.91522247 students their first +-1.9524171 on their first +-1.2968369 have their first +-1.165879 get their first +-0.91522247 enter their first +-1.5193979 to experience first +-0.9167577 also gain first +-1.1378605 The first +-1.6233823 that you first +-0.9202279 have your first +-0.93920004 context must first +-0.92067075 own way first +-0.91870147 had my first +-0.91870147 remember my first +-1.3600122 My first +-2.0222545 is the issue +-2.3548265 to the issue +-2.2021053 on 
the issue +-0.95916915 not an issue +-1.9369822 the money issue +-0.946515 important social issue +-0.77699226 a big issue +-0.54140407 a safety issue +-0.9710474 examinations , Many +-2.71242 college students complain +-1.965147 I don +-1.6357412 , I don +-1.7295498 students that don +-1.384283 that they don +-1.9166504 if they don +-1.1975414 since they don +-1.5638076 students who don +-1.4838587 those who don +-1.2363868 jobs we don +-1.8565267 they can ft +-1.3725996 This can ft +-0.07205923 I don ft +-0.18486251 that don ft +-0.07205923 they don ft +-0.2218242 who don ft +-0.18486251 we don ft +-0.34211367 you didn ft +-0.34211367 University didn ft +-0.541139 there wasn ft +-0.541139 , isn ft +-0.541139 I couldn ft +-0.541139 or aren ft +-0.704702 it won ft +-0.541139 Don ft +-0.94291425 Not having enough +-0.9673534 graduating is enough +-1.7016393 is not enough +-1.1800374 have not enough +-0.92311543 simply not enough +-1.4981924 ft have enough +-1.382527 They have enough +-0.9275952 wasn ft enough +-0.9402063 were good enough +-0.89437205 ft earn enough +-0.7036732 is stressful enough +-0.7753227 anyone fortunate enough +-0.7753227 force soon enough +-1.2420326 to save enough +-0.7753227 moving fast enough +-0.46936065 also old enough +-0.46936065 are old enough +-0.54043275 rarely adequate enough +-0.54043275 as devoting enough +-0.54043275 and enlightened enough +-0.7036732 it justified enough +-2.0004852 they can spend +-1.6321256 and not spend +-1.8970238 , and spend +-1.2566923 together and spend +-1.7603564 student to spend +-2.1205804 students to spend +-1.8006306 time to spend +-1.8228074 have to spend +-1.6349258 money to spend +-1.8896284 able to spend +-1.7368354 how to spend +-1.7035109 want to spend +-1.496278 had to spend +-2.083779 , students spend +-1.7886604 College students spend +-1.4044526 student should spend +-1.2069057 students should spend +-1.3283519 They should spend +-1.6475402 time they spend +-1.578204 To spend +-1.2338916 Students who spend +-1.3773141 I could spend +-1.2095486 they must spend +-0.8757142 is either spend +-1.4057393 , we spend +-0.90457416 as we spend +-2.1070404 with the food +-2.1194887 on the food +-1.2412089 what the food +-1.5124046 appreciate the food +-0.9677393 rent , food +-0.9677393 expensive , food +-0.7550275 taste of food +-1.2695328 close to food +-0.96491635 even on food +-0.7768402 of fast food +-0.7768402 only buy food +-0.90350926 , several times +-1.2671331 than in times +-0.96125305 food at times +-0.9602002 Often times +-0.70521736 long vacation times +-0.96518874 Firstly it gives +-1.6152257 working part-time gives +-2.7239869 part-time job gives +-0.9704541 treasurer , gives +-0.9573194 it also gives +-2.1357827 This gives +-1.2855114 credit cards gives +-0.7048308 same dorm gives +-1.4693154 of this need +-0.9236387 recognize this need +-1.909512 have the need +-0.96513784 : the need +-0.9630958 he\/she will need +-0.9379226 backgrounds often need +-0.9677806 more in need +-0.96764123 between and need +-1.5452029 that students need +-1.6418297 , students need +-1.4314951 some students need +-1.3202331 Many students need +-1.5258871 College students need +-1.2462543 where students need +-1.5112606 they may need +-0.9516817 experience they need +-0.9516817 all they need +-1.3729242 when you need +-1.2326082 jobs who need +-1.7261695 don ft need +-1.3326976 have no need +-1.4825444 work force need +-1.6570023 they would need +-1.5752801 , we need +-0.7753227 we essentially need +-1.4138292 such an emergency 
+-1.9190799 it is earned
+-1.4582977 they have earned
+-1.9395137 the money earned
+-0.9133842 it be good
+-1.5988303 also be good
+-0.8958022 a bad idea
+-0.7807752 a good idea
[... several thousand further added lines of the same form elided: each line is a base-10 log probability followed by one trigram from the test corpus; many entries carry mojibake (e.g. "fs" for "'s", "ft" for "'t") and Penn Treebank tokens such as -LRB- and -RRB- ...]
+-0.8398291 Whether learning +-1.2695947 learning and discovery +-0.96771944 discovery are cultivated +-1.5691162 only to continue +-0.86392206 that colleges continue +-0.9709693 continue to maintain +-1.3747886 and can explore +-1.3747886 one can explore +-1.4071077 begin to explore +-1.9875093 them to explore +-1.4071077 begins to explore +-0.9576777 whom to explore +-1.5875494 To explore +-1.1014369 can explore talents +-1.5667721 In the interests +-0.95657855 fs many interests +-0.96979755 talents and interests +-0.92481416 discover new interests +-0.70521736 his changing interests +-2.108051 , a full +-1.2449429 taking a full +-1.9765332 get a full +-2.4757087 to the full +-1.4320285 pay the full +-1.713674 are many full +-0.9693753 challenges of full +-0.9675995 reach their full +-1.2517172 while working full +-2.2952802 to work full +-0.9651177 post college full +-1.232292 they enter full +-1.1649166 the full extent +-0.7775762 a certain extent +-0.9697437 have , without +-0.95642704 get by without +-0.94173986 smoking -LRB- without +-1.421288 a career without +-0.9226517 smoke-free environment without +-0.77562577 's abilities without +-0.93438506 stressful enough without +-0.89472914 they earn without +-1.3491616 in society without +-0.92302394 small family without +-1.6327894 of education without +-0.93611366 several things without +-1.0971998 for us without +-0.8615437 well live without +-0.5406091 nice meal without +-1.641063 and not worrying +-0.92542505 abilities without worrying +-1.2497855 experience is about +-0.96032685 learn is about +-1.6831667 the part about +-0.8865227 invaluable knowledge about +-0.95706904 be all about +-1.5171707 teaches them about +-1.2128345 on learning about +-0.34177178 not worrying about +-0.34177178 without worrying about +-0.89167774 and learn about +-1.5755811 to learn about +-0.6845026 valuable lessons about +-0.6845026 practical lessons about +-0.94247115 cared much about +-0.81645405 valuable insight about +-0.7036732 be brought about +-0.7753227 have studied about +-0.9195662 and right about +-0.7036732 of thinking about +-0.54043275 is worried about +-0.23553617 to worry about +-1.2591032 work can translate +-1.4147173 experience may translate +-0.96587366 job not directly +-0.7054753 can translate directly +-0.54166937 particularly ones directly +-1.5671173 into a salable +-0.5417812 a salable product +-1.0928776 , particularly ones +-0.94100183 jobs often expose +-2.0002522 important to expose +-2.7109714 I partially +-0.9519091 if only partially +-0.96873605 world that awaits +-2.3173888 The answer +-0.54174346 , yes-or-no answer +-1.2483442 to this question +-2.027929 job in question +-0.96952754 question is subjective +-1.8089019 A blanket +-0.9710908 blanket , yes-or-no +-1.7433472 not to overlook +-0.95589554 answer would overlook +-1.7965453 statement for various +-1.2694774 overlook the various +-1.52377 teaches them various +-1.5154113 to enjoy various +-0.9429879 carry out various +-1.5687699 These factors +-0.7774487 various relevant factors +-0.89751804 of these factors +-0.89751804 weigh these factors +-1.2586737 working with different +-1.8578542 at a different +-0.8632097 in completely different +-0.7431135 are very different +-1.2605671 , have different +-1.7092757 that are different +-0.955036 Japan are different +-2.4471095 in a limited +-1.8517852 time is limited +-2.3479266 with the limited +-2.1229475 are not limited +-1.1528965 , school supplies +-0.90788525 afford school supplies +-2.5528169 , the probable 
+-0.970327 carefully the impact +-0.5414925 the probable impact +-0.8774103 , beneficial impact +-0.6090649 decidedly negative impact +-0.6090649 negligible negative impact +-0.5414925 will negatively impact +-1.7131681 time and energy +-0.9657898 momentum and energy +-1.5604138 all their energy +-0.70563835 the resulting reduction +-2.4458141 of their effectiveness +-1.97698 student fs effectiveness +-1.5670048 studying and completing +-2.4394312 of their assignments +-0.96484846 time or assignments +-0.5415809 and completing assignments +-0.92209035 our class assignments +-1.5687809 as the relevance +-0.5417812 Considerations pertaining +-0.96771944 factors are fairly +-0.54174346 are fairly straightforward +-0.54174346 is pretty straightforward +-0.96829027 made for either +-0.9687765 hour is either +-2.4179401 the student either +-2.133676 related to either +-1.2536224 money from either +-0.9506083 not help either +-1.1628406 was always either +-1.6473862 because it requires +-2.094728 It requires +-0.87743556 student either requires +-0.5415809 Training requires +-2.3723423 , I support +-0.96368796 parents can support +-0.967438 reasons that support +-1.9672656 money , support +-1.7238734 education , support +-1.4966071 reasons to support +-2.1043668 able to support +-1.8070188 need to support +-0.9514575 income to support +-1.2326771 afford to support +-1.6324286 parents or support +-1.5203718 I do support +-0.89502466 or fully support +-0.84088486 I firmly support +-0.7760808 I shall support +-0.540874 I emphatically support +-0.540874 to adequately support +-2.347839 a job rises +-0.97038597 rises and falls +-0.96952105 falls in relation +-1.4290076 is it second +-0.9696964 inhaling a second +-2.6002233 to the second +-1.2699327 focus , second +-2.4829874 , and second +-1.2587898 independent and second +-2.2966485 The second +-0.89201117 My second +-1.5454717 because I never +-1.2582055 there will never +-1.3263907 that have never +-0.92725366 college have never +-1.7850908 they have never +-0.8632945 I say never +-1.3802235 I could never +-1.2397053 , would never +-0.92471147 but had never +-1.3864597 restaurant is too +-1.9815063 It is too +-1.3864597 education is too +-0.9671342 important , too +-0.9671342 environment , too +-0.964117 maybe not too +-1.6322821 taking on too +-0.8950865 smells bad too +-1.2037125 job are too +-1.5146104 and are too +-1.9578663 There are too +-2.0015206 their parents too +-0.9577573 it all too +-2.0398405 If too +-0.9567826 enjoy life too +-1.204543 place where too +-1.5182676 But too +-1.1157391 is perhaps too +-1.9599059 be a great +-2.249643 is a great +-0.95823145 experience a great +-2.6580505 it is great +-0.9701716 digesting the great +-1.2114671 can make great +-1.3338006 are too great +-0.54140407 can create great +-0.9444466 great -LRB- relative +-1.4058527 more important relative +-1.1476102 In addition +-0.9395331 ; in addition +-1.2101696 classes in addition +-1.456134 them in addition +-0.9395331 about in addition +-2.3159316 to their scholastic +-1.886639 should not keep +-0.9583013 fs also keep +-1.2127582 they must keep +-2.0850334 is the last +-0.9672942 until the last +-1.8674259 that will last +-1.8494825 The last +-0.9281723 a less complicated +-1.5695815 the most complicated +-0.9703562 requires a determination +-1.2350495 the true determination +-0.9671011 not they stand +-0.93476766 make our stand +-1.438136 to that particular +-1.5480503 from a particular +-0.9655088 when a particular +-0.9441276 entering any particular 
+-0.9692625 skill in particular +-1.6777825 in a convenience +-0.96161425 example at convenience +-0.3807505 a convenience store +-1.6633965 smoke , someone +-0.97023827 value to someone +-2.051164 If someone +-0.9136732 you go someone +-0.8176309 to serve someone +-0.8176309 sometimes determine someone +-0.8176309 preferred over someone +-1.0051168 the efforts someone +-0.9696964 studied a business +-0.94005394 make valuable business +-0.97055566 problems , business +-0.94945806 someone studying business +-0.9642324 house or business +-0.93864715 knowledge about business +-0.70495963 restaurants ' business +-1.5704347 needs to weigh +-1.8913591 they should weigh +-1.3614974 student must weigh +-0.9656403 good and decide +-0.9656403 factors and decide +-1.5687287 experience to decide +-2.0076985 when they decide +-0.9707589 workplace , based +-1.6387053 or not based +-0.70521736 that theory based +-0.8181321 is purely based +-0.5414925 of formula based +-2.3714485 on the particulars +-1.765651 I feel +-1.3498288 reasons I feel +-1.7713268 , I feel +-1.1296666 reason I feel +-0.96997774 opinion and feel +-1.2489028 many people feel +-2.1229439 that it allows +-1.5600914 , it allows +-1.2503052 Working part-time allows +-2.4240267 part-time job allows +-2.2440126 time job allows +-2.140444 This allows +-2.0917118 It allows +-0.7050885 a wage allows +-1.2585078 we will start +-2.4860063 students to start +-1.8083568 money to start +-0.96261126 '' to start +-1.886059 that they start +-1.7873074 when they start +-1.517268 before they start +-1.6471473 when we start +-0.7050885 a head start +-0.54140407 a smooth start +-0.8308522 living ; building +-0.8308522 basis ; building +-1.2702949 to start building +-0.97041446 building a strong +-1.5738872 , work ethic +-0.9509333 strong work ethic +-0.970309 expenditure and ease +-1.4536839 can help ease +-1.5831437 to help ease +-0.9693766 major in subjects +-2.3299065 on their subjects +-0.93046504 only academic subjects +-1.2234472 they can learn +-0.94442195 you can learn +-1.7066113 students will learn +-1.9111121 they will learn +-0.9698452 consequently , learn +-0.96817935 quicker and learn +-1.6284102 student to learn +-1.954911 students to learn +-1.2774061 begin to learn +-1.7172894 able to learn +-1.5689906 need to learn +-1.5729257 chance to learn +-1.506639 opportunity to learn +-1.3574245 ability to learn +-1.3574245 start to learn +-1.1514233 lessons to learn +-0.9070466 wisely to learn +-0.9070466 children to learn +-0.9070466 finished to learn +-0.9070466 women to learn +-2.2131567 , students learn +-1.2314063 will also learn +-0.8699339 They also learn +-0.8699339 Students also learn +-1.5302689 They should learn +-1.8799226 that they learn +-1.3376812 skills they learn +-1.3376812 Maybe they learn +-1.3678479 will better learn +-0.76420933 also must learn +-0.9251819 Students must learn +-0.76420933 she must learn +-1.1847069 ft really learn +-1.6788732 Students learn +-1.6401705 when we learn +-1.2111412 many valuable lessons +-1.2058519 and financial lessons +-0.8443649 valuable practical lessons +-0.95659256 these important lessons +-0.8771001 and beneficial lessons +-1.6088461 of those lessons +-1.0172796 to attend lessons +-1.5537521 to be successful +-2.1461332 for a successful +-1.5485668 or a successful +-1.0924798 to build successful +-1.2704499 to start saving +-0.9698327 In a free +-0.8740446 that any free +-0.8740446 utilize any free +-1.2472482 spend their free +-0.95902205 enjoy their free +-0.9558945 depth study free 
+-0.77699226 have 2 free +-0.54140407 that single free +-1.7115754 students with structure +-2.0708947 how to divide +-1.6571755 jobs in areas +-1.8292053 in all areas +-0.7774487 from country areas +-0.9650539 references or contacts +-0.89685124 develop professional contacts +-0.87772065 valuable business contacts +-2.512545 it is best +-1.6230333 studying is best +-2.0157726 that the best +-2.0254824 is the best +-0.9620223 want the best +-1.2656678 is their best +-1.8479898 The best +-2.4372787 the student intends +-0.9709693 intends to pursue +-2.2551448 can be exhausting +-1.9720926 However , let +-1.2645392 customers , let +-0.9653459 must not let +-1.7994487 studies and let +-1.9131018 chance to let +-1.6214783 we should let +-1.7023096 And let +-0.9657641 where it becomes +-2.7413654 part-time job becomes +-0.9603899 Time becomes +-0.94074243 the smoke becomes +-0.8186782 job becomes time-consuming +-1.6605414 College is stressful +-0.9652262 time-consuming or stressful +-0.96877724 keep their priorities +-0.5417812 their priorities straight +-0.96844935 workers that quit +-1.2444222 decide to quit +-0.9575645 harder to quit +-0.9575645 asked to quit +-1.406789 forced to quit +-1.8904092 they should quit +-0.84495735 most likely quit +-0.96781695 saying that rather +-2.5020983 is a rather +-1.3835106 time studying rather +-1.4135731 their jobs rather +-0.8871436 full potential rather +-1.1990498 for themselves rather +-0.93737245 in need rather +-1.0461817 in mind rather +-0.704702 poker players rather +-1.7381359 education , than +-1.185228 have less than +-1.6348169 during college than +-0.70418733 more competitive than +-1.4222882 a career than +-1.1458826 are more than +-0.56814647 spend more than +-0.819805 But more than +-0.819805 concentration more than +-1.401847 more important than +-0.11409575 that rather than +-0.11409575 studying rather than +-0.11409575 jobs rather than +-0.11409575 potential rather than +-0.11409575 themselves rather than +-0.11409575 need rather than +-0.11409575 mind rather than +-0.11409575 players rather than +-0.54078573 or specialisation than +-1.0759023 health problems than +-0.54078573 sadder story than +-0.54078573 larger possibility than +-1.6095899 in restaurants than +-2.661735 students to sacrifice +-1.6721942 rather than sacrifice +-0.97076845 Despite the risks +-0.93066055 proven health risks +-0.94384694 co-workers -LRB- though +-0.9448806 -LRB- even though +-0.8965667 of others though +-0.70521736 the risks though +-1.3940145 Even though +-1.8836921 may be beneficial +-1.9193258 would be beneficial +-2.7282667 part-time job beneficial +-2.0118299 student is beneficial +-0.97055566 long-term , beneficial +-0.9694374 useful and beneficial +-1.9912884 , but beneficial +-0.7768402 be personally beneficial +-1.9153736 is to imply +-0.96518874 some it just +-0.968634 study is just +-1.8753234 is not just +-1.791681 should not just +-0.9665557 reports are just +-1.5151035 It fs just +-0.917614 many responsibilities just +-0.9211263 some income just +-0.54122734 have tried just +-1.0690506 little extra pocket +-0.80584294 gain extra pocket +-0.9652262 book or pocket +-0.92226976 spending : Some +-1.7405045 however , possible +-0.96229285 direct from possible +-0.9211172 focus as possible +-1.1764336 distractions as possible +-1.1764336 soon as possible +-1.6078688 at all possible +-2.0534644 If possible +-1.2096395 is quite possible +-1.2927344 not always possible +-1.7418288 pay for those +-0.9458184 And for those +-0.9458184 smoke for those 
+-2.607604 , and those +-0.9323309 lives of those +-1.4329062 members of those +-1.1968336 right of those +-1.1968336 One of those +-1.1968336 efforts of those +-0.9323309 concerns of those +-0.96667266 comparison to those +-0.96667266 compared to those +-1.9379759 For those +-1.2837069 for example those +-0.9614982 contact from those +-1.8612589 to help those +-0.8868216 are earning those +-0.9389058 specialisation than those +-0.7760808 significantly behind those +-0.540874 -LRB- Beware those +-0.970327 towards the final +-0.96824765 studying their final +-2.1427934 This final +-1.5179671 of my final +-1.3643279 My final +-1.6465164 money for years +-0.9016782 waste several years +-2.4792047 , the years +-1.2246704 The college years +-1.2246704 fs college years +-0.60818696 the final years +-0.60818696 their final years +-0.7038017 these precious years +-1.130547 in later years +-0.861359 last few years +-0.4694238 their four years +-0.4694238 these four years +-0.54052097 was 14 years +-0.7754742 next 3 years +-0.7038017 short 4 years +-0.54052097 their senior years +-0.54052097 finished 16 years +-0.7754742 around 40 years +-0.54052097 to 45 years +-1.2653071 law , medicine +-0.96822786 politics , medicine +-1.6614695 years of medicine +-0.96471906 claim with three +-1.9436862 For three +-0.5436244 I have three +-1.1013486 the following three +-0.73452914 these following three +-2.2103217 There are three +-0.90312314 fll give three +-1.0172796 Reason three +-1.7671621 is a lesson +-0.958312 useful life lesson +-2.0030637 they can before +-1.9519901 work , before +-0.96871805 circle and before +-1.6398749 of work before +-1.2293975 their work before +-0.9609232 saved money before +-1.3539121 extra-curricular activities before +-0.9570644 complicated life before +-0.8627178 never used before +-0.8171302 well-rounded lifestyle before +-0.8870965 a year before +-0.9312865 few years before +-1.8485878 is to enter +-2.12691 have to enter +-0.9628405 wants to enter +-1.7884638 when they enter +-1.4344765 once they enter +-1.518025 before they enter +-0.8774813 new graduates enter +-1.6489092 when we enter +-0.94926876 making it taste +-1.4889755 makes it taste +-2.068489 get a taste +-1.2631534 enjoy the taste +-0.96713984 alters the taste +-1.5454549 their first taste +-1.4303986 It can come +-1.9099971 is to come +-1.2646344 ; students come +-0.9570584 that may come +-1.7942575 students who come +-0.95512146 That would come +-0.7768402 reckoning shall come +-1.803115 job and manage +-2.39747 able to manage +-0.8502425 learning to manage +-1.539588 do n't manage +-0.9678614 manage to secure +-0.9678614 hopes to secure +-2.0018044 that it means +-1.1831028 if it means +-0.73083824 what it means +-1.9546864 have the means +-0.96597487 then have means +-0.94977397 `` which means +-0.5414925 Independent means +-2.1276162 are not relying +-1.6721942 rather than relying +-0.9585228 supported by organizations +-0.96306306 support from organizations +-1.7823436 the world lying +-0.54166937 world lying beyond +-0.7054753 new worlds beyond +-0.54166937 Anything beyond +-1.7780188 the college gates +-2.3770523 a student outside +-2.3117998 part-time jobs outside +-0.95834094 with an outside +-1.6135128 social skills outside +-1.1169219 be learned outside +-1.4102702 with people outside +-1.6171986 of life outside +-0.9247135 those living outside +-0.8624685 to live outside +-0.77661157 student opinions outside +-0.9473217 immediate social circles +-0.9649377 universities will meet +-0.96979755 circles 
and meet +-2.4032547 students to meet +-2.0597405 have to meet +-1.7298238 opportunity to meet +-1.244203 just to meet +-1.9707899 student fs meet +-0.5414925 would undoubtedly meet +-0.964565 entering a new +-1.2580774 creating a new +-2.5949848 of the new +-0.9660607 become the new +-2.1288831 This new +-0.75286186 understanding something new +-0.75286186 want something new +-0.9419689 time learning new +-0.8760961 To explore new +-0.7118446 and meet new +-0.94232285 to meet new +-0.77623254 could discover new +-0.9328868 , our new +-0.8173369 and creating new +-0.5409623 be encountering new +-0.96602863 money it offers +-1.2315707 that work offers +-1.8841059 part-time work offers +-1.890775 also be considered +-2.3865998 should be against +-0.970251 crisis , against +-0.8624685 am completely against +-0.95828134 of experience against +-0.9445638 than there against +-1.540733 it 's against +-0.6983757 I am against +-0.5410506 be considered against +-0.70457333 am absolutely against +-1.7330232 to smoke against +-0.92509264 heavy course load +-0.96616054 college work load +-1.2950244 a full load +-0.8422966 a heavy load +-2.0907428 will be much +-1.2624958 it that much +-2.3570333 is a much +-2.4968066 have a much +-1.8368138 student is much +-1.7211828 time is much +-0.949683 environment is much +-1.657977 classes , much +-0.9677806 concepts in much +-0.8924813 students as much +-0.8924813 put as much +-0.8924813 require as much +-0.8924813 learn as much +-0.77728224 also so much +-0.94425076 have so much +-0.77728224 learned so much +-0.77728224 because so much +-1.6314275 has become much +-1.1784327 work environment much +-1.4256423 jobs provide much +-1.0998782 Too much +-0.75326735 is too much +-0.60630846 on too much +-0.60630846 parents too much +-0.60630846 If too much +-0.87516296 Japanese universities much +-0.7036732 makes dating much +-0.54043275 could stem much +-0.54043275 with cared much +-0.9577736 work may hinder +-1.5466845 more than hinder +-0.96840984 hinder their performance +-1.4930594 their academic performance +-0.5415809 in improved performance +-0.5415809 the efficient performance +-1.6525099 money for books +-1.44468 tuition , books +-2.1587815 to their books +-1.7854307 from their books +-0.962452 only from books +-0.81796503 obtaining needed books +-0.7050885 students purchase books +-2.094723 be a hindrance +-2.4024405 I worked +-1.6347499 , I worked +-1.56327 discipline and worked +-0.95094866 and only worked +-0.96568465 parents have worked +-0.9436016 I myself worked +-0.7345763 I never worked +-0.9462086 have never worked +-0.9271512 world really worked +-1.2160836 drinking with my +-0.94269454 continued with my +-1.5704706 reasons for my +-1.7261014 pay for my +-0.94309974 enough for my +-2.439045 it is my +-2.052978 It is my +-1.2643021 said , my +-2.007056 In my +-1.6873509 , in my +-0.9360457 so in my +-0.9360457 keep in my +-0.9360457 things in my +-1.373809 top of my +-1.3866109 one of my +-0.9455529 Some of my +-0.9455529 weeks of my +-0.9640207 importance to my +-1.5423869 start to my +-1.6068672 focus on my +-0.89001435 up on my +-1.310997 based on my +-0.89001435 back on my +-1.7287846 to pay my +-0.95852464 work or my +-0.94982773 following are my +-1.2295682 These are my +-0.91390693 enjoy taking my +-1.4741772 to put my +-1.5750062 , when my +-1.0436957 Of my +-0.9211468 cover all my +-0.9211468 covered all my +-1.7063323 it fs my +-1.3604625 growing up my +-0.9328818 much about my +-0.7074367 that support my +-0.9552516 to support my 
+-0.7074367 shall support my +-0.8976074 myself worked my +-0.88319516 nor did my +-1.3304244 to balance my +-1.123929 I spent my +-1.3958049 I had my +-0.88247085 out spending my +-0.5388479 During my +-0.5388479 to organize my +-0.8368516 , reduces my +-0.70136726 job throughout my +-0.70136726 still remember my +-0.70136726 disadvantage versus my +-0.5388479 will voice my +-0.5388479 seriously jeopardize my +-1.4619812 as I did +-0.9215383 like I did +-0.9215383 Perhaps I did +-0.9653459 had not did +-1.0915376 , nor did +-2.0917118 It did +-0.8445425 degree certainly did +-0.81796503 I finally did +-2.1530933 I really +-1.9238683 , I really +-0.9206844 could I really +-0.93895787 are often really +-1.5645849 hard to really +-0.9499581 people only really +-1.1390803 don ft really +-1.3736824 the food really +-0.9076451 did n't really +-0.95577973 There fs really +-1.4472752 I am really +-1.7677474 real world really +-0.9452042 have never really +-0.733959 could never really +-1.1411238 or she really +-0.9357073 right through until +-0.88775027 humble position until +-0.9216024 the way until +-0.7769962 not diligent until +-0.8177979 world h until +-0.34219918 can wait until +-0.34219918 to wait until +-0.9339082 or restaurants until +-1.0482398 the last weeks +-0.9446826 a lower workload +-0.9579728 when my workload +-1.5643476 I was +-1.485258 , I was +-0.821154 college I was +-1.010323 : I was +-0.67054653 when I was +-1.1487457 because I was +-1.0975442 When I was +-1.4285897 If it was +-1.4405235 class and was +-1.4298608 and work was +-0.9493093 degree which was +-1.8653182 college education was +-0.7048308 my workload was +-0.9135421 and he was +-0.92548186 workload was huge +-2.0069602 they can go +-2.0894058 do not go +-0.9692575 things and go +-1.4632062 reasons to go +-1.3634865 it to go +-1.5362816 has to go +-1.9889746 able to go +-1.5491309 enough to go +-1.2141695 afford to go +-1.7524542 want to go +-0.9677842 why students go +-0.9469285 everywhere you go +-1.1796057 students must go +-0.8626441 workers must go +-0.8766697 will actually go +-0.96559507 Therefore I conclude +-0.9709195 like to conclude +-1.6219896 work part-time now +-0.96906173 Communication is now +-0.959694 Most people now +-0.70521736 I conclude now +-1.0777895 young adults now +-2.0994792 the student needs +-1.3917652 college student needs +-1.3115649 most cases needs +-0.6179717 their financial needs +-0.86294854 have different needs +-0.9269427 she really needs +-0.8441873 very basic needs +-1.765013 in Japan needs +-0.8441873 support his\/her needs +-2.1887593 with the costs +-2.3057199 , the costs +-1.9866488 of the costs +-1.2680655 benefits and costs +-0.94961387 because studying costs +-0.91394144 ever-increasing tuition costs +-0.7050885 ever increasing costs +-0.951495 good education costs +-0.9702419 To a certain +-2.0661583 In certain +-0.9708217 leads to certain +-2.04629 I understand +-2.0855575 do not understand +-1.648014 enough to understand +-0.96690404 h to understand +-1.7858365 help students understand +-1.4049987 I also understand +-0.9661646 do they understand +-0.6183546 will better understand +-0.9473955 and could understand +-1.3986168 students would understand +-0.895424 not fully understand +-0.45457235 for that matter +-2.531173 is a matter +-1.5577679 should spend every +-1.6119055 of people every +-1.1423458 Not every +-0.92487025 like Japan every +-0.8964715 benefit me every +-1.2583088 family can afford +-1.6501087 can not afford +-0.94916594 could not afford 
+-1.7023263 able to afford +-1.8694725 to help afford +-1.3271807 I really afford +-0.8778863 can afford maintaining +-0.96877724 maintaining their son +-0.9653139 son or daughter +-2.4867222 able to dedicate +-0.86392206 to entirely dedicate +-0.94063544 may make him +-1.4932024 to support him +-0.34232748 to dedicate him +-0.34232748 entirely dedicate him +-1.0709139 to prepare him +-0.4538205 him or herself +-0.9307304 studies ; moreover +-1.6483059 reason for getting +-0.9679221 family is getting +-1.8586749 First , getting +-0.9683588 through and getting +-0.9564748 ways of getting +-0.9564748 responsibly of getting +-1.4037303 risk of getting +-0.9696556 comes to getting +-0.9613394 ranging from getting +-1.2601715 college are getting +-0.9374052 worried about getting +-1.6599497 rather than getting +-0.8433006 others though getting +-1.1304172 not just getting +-0.77592903 and essentially getting +-0.970158 higher and higher +-1.259375 cases of higher +-0.96522456 institutions of higher +-0.9253135 are getting higher +-0.96934706 period is extreme +-1.9030274 , in extreme +-0.897185 a rather extreme +-2.4765916 important for maturing +-0.9488822 course could consume +-2.8185043 of the momentum +-1.3827922 that could preciously +-2.2209737 There are innumerous +-0.4542107 know that hard +-2.164734 It is hard +-1.2663919 beyond the hard +-0.9697437 smart , hard +-2.5866687 , and hard +-2.182258 value of hard +-1.2408347 to study hard +-0.94821346 by studying hard +-0.8842865 work very hard +-0.8842865 worked very hard +-2.2825007 to work hard +-0.95162916 working so hard +-1.3550365 and how hard +-0.8164635 money demands hard +-0.8164635 it becomes hard +-0.92548573 often really hard +-1.2703568 let the so-called +-1.5622797 during their so-called +-0.96979755 there and back +-2.375778 they are back +-0.7771444 that giving back +-1.8605232 I think back +-0.5414925 to run back +-1.4436433 books and classrooms +-2.314604 to their classrooms +-1.9546864 by the lack +-1.9977528 lead to lack +-2.3051038 The lack +-0.96676177 classrooms they lack +-0.964643 delay or lack +-0.97025436 impression of power +-1.4019995 the necessary power +-0.970705 power to concentrate +-2.0453026 students should concentrate +-1.5126889 we should concentrate +-1.3610439 student must concentrate +-0.86383766 and instead concentrate +-0.9585228 them by active +-1.2490816 take an active +-0.70563835 an active role +-1.5680019 up the process +-0.86376655 his\/her educational process +-1.2172534 the learning process +-0.95166093 fs some ! +-0.9562895 agree more ! +-0.8620951 your loans ! +-0.8950865 say never ! +-0.81679666 of labor ! +-0.77592903 right soon ! +-1.0036056 I wanted ! +-1.0750585 bad habits ! +-0.70418733 : wrong ! +-0.54078573 I guess ! +-0.81679666 done properly ! +-0.54078573 Never ! +-0.54078573 , unite ! 
+-1.4396083 only for resting +-0.9348925 see themselves acquiring +-0.88864374 acquiring knowledge exclusively +-1.438035 only for poor +-2.087338 be a poor +-1.8638264 First , poor +-1.8454288 from their poor +-0.7050885 also mean poor +-0.94361335 affordable then poor +-0.96934706 important is participation +-1.8055009 , then participation +-0.86376655 their poor participation +-1.2623075 no time dedicated +-1.8083861 A solution +-0.7775762 rather extreme solution +-1.5220007 be more scholarships +-0.9703562 receive a government +-2.2101698 , the government +-2.3378284 of the government +-1.9232823 from the government +-0.9566298 namely the government +-1.2687647 least a system +-2.7886145 of the system +-0.5414925 monetary enslavement system +-0.7771444 the economic system +-0.5414925 national healthcare system +-2.4993913 , and allow +-1.4293842 fees and allow +-0.949929 income which allow +-1.4012885 that would allow +-0.87743556 would simply allow +-0.9708217 her to practice +-0.59240544 put into practice +-0.87460804 classrooms into practice +-0.7054753 This whole practice +-2.0678182 In comparison +-0.9694723 pales in comparison +-0.9434406 activities that young +-1.4691062 understand that young +-1.3681695 fact that young +-1.9697012 , a young +-2.011876 in a young +-1.984488 of a young +-1.4622189 from a young +-0.94137615 Should a young +-1.7770138 for the young +-1.9466602 college , young +-1.2067869 Too often young +-1.4016753 for many young +-1.1651682 that many young +-1.3628974 for any young +-0.9690737 applies to young +-0.9635135 influence on young +-0.94493 1 -RRB- young +-1.2879872 Most young +-1.6971351 They are young +-0.9528733 shoppers are young +-0.88615644 part-time gives young +-1.4157193 will help young +-1.5735474 to help young +-0.9314969 know other young +-1.6656474 for all young +-2.0288048 If young +-1.366492 will give young +-1.2250458 experience as adults +-1.7177081 well as adults +-0.8880774 become responsible adults +-0.87452316 , young adults +-1.0958514 are young adults +-0.8182215 , mature adults +-0.5414925 for younger adults +-2.0372965 They tend +-1.0784609 young adults tend +-1.8474624 student is always +-1.4951766 money is always +-0.95104927 someone is always +-0.9464351 focus will always +-0.9464351 world will always +-1.7109442 is not always +-1.1829293 but not always +-1.1829293 's not always +-1.9731001 that students always +-1.6444814 and are always +-1.1542833 ca n't always +-1.0479476 I was always +-0.9653139 education or specialisation +-1.7852731 their college counterparts +-1.2709996 Generally , non +-2.1995287 have to adapt +-2.1114297 them to adapt +-2.4534755 in a relatively +-2.630871 to the relatively +-0.96844935 why that short +-1.3795258 a very short +-0.70534635 a relatively short +-0.5415809 , severely short +-2.0342643 be a problem +-2.2533517 of a problem +-0.9669855 week the problem +-0.9669855 aside the problem +-0.9707589 initiative , problem +-2.3051038 The problem +-1.4044609 The main problem +-1.2558315 learning , since +-0.9634209 problem , since +-0.9634209 easier , since +-0.9657593 our work since +-0.88761127 to graduate since +-0.81792647 more wisely since +-0.92150915 every day since +-0.8444742 in clubs since +-0.8177979 rational approach since +-0.95863444 since this change +-2.057076 to a change +-1.5446997 This will change +-0.9703549 needing to change +-1.6166931 it may change +-0.70495963 -RRB- quickly change +-1.0051459 can easily change +-1.416526 is more manageable +-2.2938926 to be concerned 
+-0.970158 active and concerned +-2.074859 the work concerned +-0.9281723 usually less sophisticated +-1.5216873 be more sophisticated +-1.270913 them , whilst +-2.8064618 part time whilst +-0.70563835 time whilst furthering +-1.5322751 a valuable insight +-1.5653945 experience and insight +-1.2871864 the added insight +-0.5415809 of pecuniary insight +-0.93973964 insight about assimilation +-2.1505525 This bridging +-2.1209922 is the effect +-1.2475339 have an effect +-0.8445425 The immediate effect +-0.54140407 This bridging effect +-0.54140407 a detrimental effect +-0.54140407 a definite effect +-2.7458208 part-time job enables +-0.96629435 we work enables +-0.93946695 effect better enables +-0.9473076 did -RRB- quickly +-0.84262145 to adapt quickly +-2.3159316 in their leap +-0.92226976 counter-argument : Working +-0.94926876 working it teaches +-1.5904769 because it teaches +-0.96997774 community and teaches +-2.8031495 part time teaches +-1.6538267 It teaches +-0.968682 employment that theory +-2.814478 of the theory +-1.2649939 important that universities +-1.7982335 parents and universities +-1.6847119 can help universities +-0.9138709 most Japanese universities +-0.8449086 theory based universities +-0.8875358 most large universities +-0.70495963 most U.S. universities +-1.557113 student will likely +-0.94687754 possibilities will likely +-0.9277999 far less likely +-0.94690406 would most likely +-1.7113326 that are likely +-1.5929402 jobs are likely +-0.93973964 them about budgeting +-1.4847132 , work ethics +-1.32818 days work ethics +-0.92796654 Basic work ethics +-2.4236782 in a workplace +-1.5308905 In the workplace +-1.7190583 in the workplace +-1.9797235 of the workplace +-0.9701495 ethics , workplace +-1.438929 class and workplace +-1.5620195 variety of workplace +-0.96727586 into their workplace +-2.2802162 The workplace +-0.8867519 ereal f workplace +-1.6845045 the future workplace +-0.77623254 build successful workplace +-1.1628797 to understand workplace +-1.270913 management , productivity +-0.970309 quality and productivity +-0.8453008 a practical manner +-0.9514415 on these issues +-0.9696175 issues and instead +-1.5391097 time working instead +-2.1208038 their studies instead +-0.88785994 for earning instead +-1.2327487 an education instead +-0.54140407 lucrative tips instead +-2.3714485 on the core +-0.97025436 image of material +-1.0861467 the course material +-0.8150853 core course material +-1.9789475 for their respective +-0.8426969 We hope +-0.9687079 as their principle +-2.3173888 The principle +-2.3635976 on the concern +-0.97001797 regardless of concern +-0.8635808 a major concern +-0.70534635 their principle concern +-1.5127802 , by definition +-0.9224694 then by definition +-0.9703562 train a worker +-1.6751995 a full-time worker +-1.2610804 So it follows +-1.7773734 dealing with problems +-1.2524239 have money problems +-0.8304509 any health problems +-0.8304509 to health problems +-0.5414925 more serious problems +-0.5414925 other respiratory problems +-2.5074086 it is perhaps +-0.96186733 Japan is perhaps +-0.9647839 income will perhaps +-1.4363422 is , perhaps +-0.96784025 Others , perhaps +-1.7278193 some students perhaps +-1.5439681 class or perhaps +-0.96692693 Some are perhaps +-1.7346755 become a fully +-2.0894058 do not fully +-1.2678394 efforts of fully +-1.569535 to become fully +-0.8701349 they become fully +-0.9640272 partly or fully +-2.363772 they are fully +-1.4131423 they graduate fully +-0.8873397 are perhaps fully +-1.7339616 students 
and families +-1.803518 with their families +-1.6200986 by their families +-0.94521844 assist their families +-1.3884045 and most families +-1.6961809 Many families +-0.93747365 places where families +-0.5413157 from poorer families +-0.70495963 from wealthy families +-1.119492 , perhaps older +-1.2617884 , have saved +-2.1653402 in their courses +-0.9594985 before their courses +-0.54166937 of college-level courses +-0.54166937 them failing courses +-1.0065018 their courses began +-0.96537435 many will fall +-1.6496032 available to fall +-2.1107414 them to fall +-0.7776412 will fall somewhere +-0.96880597 somewhere in between +-0.7048308 middle ground between +-0.54122734 short span between +-0.9027323 right balance between +-1.2098205 to choose between +-0.8441873 a choice between +-0.54122734 is split between +-0.84159017 be divided between +-1.6395895 a little bit +-0.9703093 bit of book +-0.9683228 waters that hold +-0.93780583 gainful employment hold +-2.0663214 how to hold +-0.96187824 does working hold +-0.9619357 he should hold +-2.1560578 them to escape +-1.7510254 as an escape +-2.069276 from the confines +-1.2382638 and school worlds +-0.925402 explore new worlds +-2.296603 to be brought +-1.3827262 that has brought +-2.1833723 It is far +-1.6967614 from working far +-0.9058511 them so far +-0.9058511 go so far +-1.7206147 there are far +-0.8181321 you fre far +-2.1568217 them to dip +-0.96877724 dip their toes +-1.7421509 into the unfathomable +-0.5417812 the unfathomable waters +-1.134073 become fully immersed +-0.96822786 dormitories , preparing +-0.96822786 wisdom , preparing +-0.54174346 Thereby preparing +-2.2509236 have to traverse +-1.251237 for the rest +-1.2634587 through the rest +-1.9140763 is to rest +-0.7774487 get proper rest +-1.2691678 enrich the lives +-0.9305558 that their lives +-1.8741325 in their lives +-1.3254359 of their lives +-2.0355797 on their lives +-0.45301723 their working lives +-0.9401783 people 's lives +-0.8430187 in our lives +-0.8430187 All our lives +-1.0471783 their daily lives +-0.96313107 Wages from gainful +-0.9703562 student a sample +-1.7896512 opportunity to sample +-1.5569246 ability to sample +-0.9708349 sample the delights +-0.93937266 they often want +-0.96757615 spoiled students want +-0.95661545 We may want +-0.91044164 but they want +-0.719741 what they want +-0.91044164 When they want +-0.91044164 anyway they want +-1.4913878 , you want +-0.9525968 non-smokers who want +-1.7332808 don ft want +-0.9440902 All parents want +-0.65153563 employer might want +-0.65153563 we might want +-0.9525061 whenever we want +-1.2642909 decide to try +-1.8968899 want to try +-1.2534233 and should try +-0.95377535 Those who try +-0.90151215 do several things +-1.2074438 many valuable things +-1.8003381 life , things +-0.62466204 learn many things +-1.5314252 Such things +-1.4258283 and have things +-1.5564198 These things +-0.9402063 more good things +-1.136229 many other things +-0.84119564 about other things +-0.90236163 learning difficult things +-0.8236918 do these things +-0.8236918 done these things +-0.8236918 appreciate these things +-0.94098914 try out things +-1.328932 for those things +-0.81267905 learning new things +-0.81267905 encountering new things +-0.54043275 many interesting things +-0.7753227 can buy things +-0.54043275 the obligatory things +-1.2838506 that were previously +-0.5417812 were previously unavailable +-0.8544539 a well rounded +-0.8544539 A well rounded +-1.1339254 become fully rounded +-2.153538 , it takes 
+-1.4992511 what it takes +-2.338211 a job takes +-2.1335695 a student takes +-1.9152932 college student takes +-1.3669456 , learning takes +-0.70521736 the side takes +-1.6421189 They will hopefully +-0.94714874 education will hopefully +-1.270913 expenses , hopefully +-2.087095 they will appreciate +-0.9696175 understand and appreciate +-1.8468289 is to appreciate +-2.2839632 able to appreciate +-1.4261363 likely to appreciate +-1.7885827 help students appreciate +-0.93883246 student better appreciate +-0.9438248 will hopefully appreciate +-2.3892295 they are paying +-1.7903471 for the next +-1.9156256 , to round +-0.84262145 the next round +-0.9703093 round of drinks +-0.970309 country and definitely +-0.9480326 should very definitely +-2.540267 , the harder +-0.95623523 them study harder +-0.94507897 that much harder +-0.90357953 and worked harder +-0.9650865 environment can suffer +-1.865048 learn to suffer +-1.2609593 And it wo +-0.9223168 selfish customers wo +-1.1561959 wo n't kill +-2.250717 can be fun +-1.4727278 and having fun +-2.4968479 to have fun +-2.1597657 students are mostly +-0.96979755 food and basic +-1.225114 the very basic +-2.1506422 to do basic +-0.5414925 are mostly basic +-0.5414925 of mastering basic +-2.5457652 , the simple +-0.970158 basic and simple +-1.4513023 , doing simple +-2.5348375 , the tasks +-1.0056605 even small tasks +-0.4701193 and simple tasks +-0.4701193 doing simple tasks +-0.5414925 and prioritize tasks +-0.8421199 to perform tasks +-0.9710474 dropped , suffering +-1.9545219 work and suffering +-0.97001797 virtues of labor +-0.96840984 shops their labor +-0.77729654 doing menial labor +-0.5415809 or manual labor +-0.909759 is n't fatal +-1.5272865 But timing +-0.9653459 why not Japan +-0.86403596 study in Japan +-0.7389712 students in Japan +-1.4307592 work in Japan +-0.86403596 issue in Japan +-1.0785313 especially in Japan +-0.86403596 policy in Japan +-0.86403596 efforts in Japan +-0.86403596 Currently in Japan +-0.86403596 establishments in Japan +-0.86403596 restaurants in Japan +-0.42900413 smokers in Japan +-0.97047156 came to Japan +-0.9246755 society like Japan +-1.0054032 , including Japan +-1.8594404 students in America +-1.8920971 work in America +-0.34240606 in North America +-0.34240606 swept North America +-1.8973225 pay for rent +-2.540267 , the rent +-1.2649392 fees , rent +-0.9680422 books , rent +-0.5415809 my monthly rent +-1.7434075 people , married +-0.9534694 usually get married +-0.9632594 high-school student generally +-0.9694374 courting and generally +-2.0156863 part-time work generally +-2.1462479 students are generally +-1.7660506 in Japan generally +-0.9087888 young person generally +-0.966677 customers the freedom +-0.966677 wife the freedom +-0.97055566 are , freedom +-0.96892005 thinking in freedom +-1.5489037 taste of freedom +-0.96471596 sort of freedom +-1.5226834 to experience freedom +-0.9574894 little more freedom +-1.1416409 With freedom +-1.5687809 In the sophomore +-0.94732565 So most sophomores +-1.9593399 by the 4th +-2.4505625 in a service +-0.7774487 , community service +-0.7774487 , customer service +-1.9779186 student fs prime +-2.1209922 is the purpose +-1.0436072 the main purpose +-1.0436072 The main purpose +-0.54140407 fs prime purpose +-0.54140407 the sole purpose +-1.0175442 the entire purpose +-0.54140407 the ultimate purpose +-0.5417812 Cleaning toilets +-0.9653139 toilets or serving +-0.5417812 or serving burgers +-1.2592678 to not waste +-0.86302435 and completely waste +-0.9703549 apt 
to waste +-1.3029518 a real waste +-1.666686 rather than waste +-0.88761127 a complete waste +-0.5413157 Why waste +-0.96347237 I think +-1.4211972 , I think +-1.1343137 and I think +-0.7994673 so I think +-1.1038669 because I think +-0.97724015 which I think +-0.7994673 So I think +-1.0574335 When I think +-1.3443974 do not think +-2.1900768 have to think +-2.0280533 how to think +-1.4066516 I also think +-0.94711846 ft you think +-1.5233672 people who think +-1.140386 don ft think +-0.96829027 wanted for anything +-1.2699327 success , anything +-2.1465714 to do anything +-2.0534644 If anything +-0.9526682 or get anything +-0.9426427 point learning anything +-0.949503 really learn anything +-1.2495443 Some people argue +-1.40211 students would argue +-2.0677173 how to serve +-1.8195249 can also serve +-1.4969108 will only serve +-1.5567433 to go serve +-0.9653139 restaurant or clean +-0.95346725 clean some dishes +-0.54174346 are washing dishes +-0.96873605 but that pales +-0.943531 with learning international +-0.9977236 against the law +-0.9708606 medicine , law +-2.0306695 such as law +-0.5415809 learning international law +-0.9652262 law or advanced +-0.939651 right about advanced +-0.70563835 or advanced mechanical +-2.700372 I focused +-2.4066699 should be focused +-2.0149891 student is focused +-0.96979755 concentrated and focused +-0.7771444 is heavily focused +-2.397405 , I truly +-0.96934706 profession is truly +-2.4837198 able to truly +-1.2084849 several reasons why +-0.7522385 many reasons why +-1.0312898 of reasons why +-1.1274343 two reasons why +-1.637866 this is why +-1.2324202 That is why +-0.95132303 cwhich is why +-1.200659 only reason why +-0.9533981 smoking so why +-0.877244 really see why +-0.9538444 first be made +-1.7025523 could be made +-1.2592678 have not made +-1.22635 one has made +-0.9575156 , also made +-1.4300328 They have made +-1.180395 the effort made +-0.92435646 it was made +-0.95854473 made more affordable +-1.8875811 who are smart +-0.9653139 working or gifted +-0.9657898 full and proper +-0.9657898 meet and proper +-1.8489242 to get proper +-1.88799 , to reduce +-0.96748304 colleges to reduce +-0.9509391 may help reduce +-0.9482944 , could reduce +-0.8634614 -RRB- loans reduce +-0.70521736 would significantly reduce +-1.4847552 because I firmly +-1.3922634 why I firmly +-2.4202912 should be encouraged +-1.2691407 hard and reach +-1.8378661 as they reach +-1.6497928 when we reach +-0.938292 time doing meaningless +-0.9710908 meaningless , unchallenging +-0.97038597 unchallenging and pointless +-1.5602461 opportunity for anyone +-2.2226095 believe that anyone +-0.9582222 upon by anyone +-1.853765 time in anyone +-1.2370737 not so fortunate +-1.6328001 I was fortunate +-0.8184666 for anyone fortunate +-2.0652468 student to attend +-1.6886886 able to attend +-1.5629126 good for everyone +-1.1433476 Not everyone +-1.851487 their time wisely +-0.95155805 using time wisely +-2.1841552 they have wisely +-0.9580905 money more wisely +-1.3702371 and spend wisely +-1.2691407 degree and moving +-0.9304497 aren ft moving +-0.90961 can before moving +-0.9709195 choice to refrain +-2.2863927 students should refrain +-1.1785548 expanding your circle +-0.54174346 a vicious circle +-2.1590135 college student knows +-0.54174346 Everyone knows +-0.94724613 their most precious +-0.9513582 during these precious +-0.70563835 most precious commodity +-0.96005386 Consider an average +-0.95838577 time by 20 +-1.810164 , working 20 +-1.7359891 number of hours +-0.9611777 
excessive working hours
+-1.23595 during school hours
+-1.3467282 not enough hours
+-0.21872002 20 hours
+-0.21872002 by 20 hours
+-0.21872002 working 20 hours
+-0.5410506 's 25 hours
+-0.7760148 a few hours
+-0.8762872 have long hours
+-0.77638435 or 40 hours
+-0.5410506 play 24 hours
+-1.5696235 hours , 5
+-0.96308905 my student days
+-0.9597576 their college days
+-1.1794721 fs college days
+-0.44393384 my college days
+-0.4182338 students these days
[... several thousand further "+<log10 probability> <n-gram>" entries from the dumped language-model data continue here in the same one-entry-per-line diff format ...]
+-0.97041446 poses a threat
+-0.97041446 quite a distracter
+-1.2657207 a students boils
+-0.84507585 to hold down
+-0.54166937 students
boils down +-0.77750957 days holding down +-1.2105547 is about networking +-1.2695947 grades and landing +-2.0201955 student is ideal +-1.6395347 and not properly +-0.96078736 any jobs properly +-0.8881657 anything done properly +-1.1435627 to eat properly +-0.9709693 resources to train +-1.7406356 on a temporary +-2.546066 in the pursuit +-1.212927 on his pursuit +-0.9502854 University studies lay +-2.1025329 student to forgo +-0.8186782 make him physically +-0.8426969 Any forms +-0.97025436 forms of exhaustion +-0.96306306 suffering from exhaustion +-0.70563835 of exhaustion interferes +-1.4446744 concept of offering +-2.4202912 should be weighed +-0.7776412 this economic crisis +-0.63075274 find a middle +-0.9710474 workers , middle +-0.7776412 , middle aged +-0.70563835 , married couples +-1.3737625 and even elderly +-1.267528 society is heavily +-1.8856301 who are heavily +-0.54166937 their profits heavily +-1.2709996 debt , severely +-0.9539771 be so disparate +-0.9575429 eventually do dropout +-1.5693139 due to losing +-1.3682667 They become trapped +-2.3556416 with the image +-0.7776412 of material goods +-1.4138292 and an exciting +-1.0481644 have earned versus +-0.84262145 a disadvantage versus +-0.9708349 versus the boring +-1.269442 life of uninteresting +-2.548219 in the media +-1.2696514 within a typical +-0.970309 smaller and smaller +-0.9254592 is getting smaller +-1.2591032 family can possibly +-2.6843307 , and possibly +-0.70563835 can possibly gather +-1.3617146 students often seek +-1.4454614 forced to seek +-1.6004952 , many establishments +-0.77761716 at entertainment establishments +-0.970309 Restaurants and bars +-2.0335622 such as bars +-0.970309 beer and nightclubs +-0.9652262 bars or nightclubs +-1.2490816 be an obvious +-0.6219591 and most obvious +-1.5220007 be more attracted +-2.630871 to the lucrative +-0.9480326 to very lucrative +-0.70563835 the lucrative tips +-1.8616375 at a Las +-0.9694723 tables in Las +-0.34242433 a Las Vegas +-0.34242433 in Las Vegas +-0.8426969 Las Vegas casino +-2.0582547 would be mesmerized +-0.9703093 potential of winning +-2.1323423 at the tables +-2.5330536 is a nationwide +-0.5417812 a nationwide boom +-0.9654407 boom with Texas +-1.270913 Third , Texas +-0.34242433 with Texas Hold +-0.34242433 , Texas Hold +-0.3171513 Texas Hold eEm +-0.3171513 Hold eEm Poker +-0.9694723 Poker in North +-0.54174346 has swept North +-0.9702419 enter a poker +-0.89685124 become professional poker +-0.7774487 Playing online poker +-1.2197042 many young players +-0.7775762 professional poker players +-2.260049 of a diploma +-0.96581453 pursuing a diploma +-0.9480326 still very popular +-0.9539014 become so popular +-0.96644884 players have evolved +-1.7890856 you are skilled +-1.3955564 with some luck +-2.5504236 , the prize +-0.54174346 the grand prize +-0.9254905 This new trend +-0.94892174 trend has swept +-0.97038597 America and continues +-1.1843383 the new addictive +-0.5417812 new addictive drug +-1.2617884 You have access +-0.8426969 to play 24 +-1.9164995 chance to win +-1.1442131 and ultimately win +-0.70563835 to win substantial +-0.8186782 substantial cash prizes +-0.7776412 a poker tournament +-0.9708349 win the grand +-1.2492541 for an incomplete +-1.1480799 having to worry +-0.9678614 much to worry +-1.2707573 willing to invest +-1.6383204 student will respond +-0.94556415 his\/her parents proud +-2.2551448 can be plainly +-2.066828 can be divided +-1.2391287 always be divided +-2.814478 of the goals +-0.54174346 long term goals 
+-0.87795234 were once established +-0.9556397 established when applying +-1.3661013 My brother +-0.8186068 he finally dropped +-0.54174346 My brother dropped +-1.8051585 out of Northeastern +-0.96952105 University in Massachusetts +-0.70563835 after double majoring +-0.96952105 majoring in physics +-1.4184357 working at Disney +-0.9623362 fs working holiday +-0.9708349 Also the chances +-2.8066833 of the partying +-2.0322413 such as partying +-0.9650539 studying or partying +-0.95854473 other more serious +-1.8089019 A budgeted +-1.2277694 parents could stem +-0.7776412 the partying funds +-1.0482743 should concentrate 100 +-0.9596702 then this distracts +-2.4044886 is a reasonable +-1.8297209 at a reasonable +-2.0840614 at the restaurants +-0.9663688 Inside the restaurants +-1.525109 jobs in restaurants +-0.9388047 ban in restaurants +-0.4552181 smoking in restaurants +-0.9388047 sections in restaurants +-0.9701217 out to restaurants +-0.9646609 and on restaurants +-0.9638222 stores or restaurants +-0.9324817 customers at restaurants +-0.9324817 smoking at restaurants +-0.2478974 in all restaurants +-1.1025386 from all restaurants +-1.3280149 at all restaurants +-1.0461817 and non-smoking restaurants +-0.541139 that includes restaurants +-0.91470444 our friends whenever +-1.5671173 into a vicious +-1.5361018 Students should postpone +-1.4452091 entering the greal +-2.1573694 to do housework +-0.97038597 housework and chores +-0.96996903 place a ban +-0.45430523 reason to ban +-0.96748304 campaigns to ban +-0.70521736 an outright ban +-0.38060558 a total ban +-0.62177587 a smoking ban +-0.95398855 employers who employ +-1.6524273 time they possess +-1.269442 pursuit of Mammon +-0.9580144 Mammon fs glittering +-0.5417812 fs glittering bounty +-0.8885696 is both wrong-headed +-0.97038597 wrong-headed and short-sighted +-0.8186782 has ever known +-2.1600668 college student appreciates +-2.6336813 to the acquisition +-0.9596702 Interrupting this headlong +-0.5417812 this headlong charge +-0.96894807 charge for enlightenment +-1.269442 work of demons +-0.9657569 generations a man +-0.9657569 profit a man +-1.5479839 a young man +-0.94109654 lose his bursary +-0.94453686 -LRB- Beware +-1.7033607 those who extol +-0.9703093 day of reckoning +-0.8453008 spend every waking +-0.8186782 waking hour digesting +-0.8885696 the great dollops +-0.9703093 dollops of nourishing +-0.5417812 of nourishing scholarship +-0.5417812 nourishing scholarship fed +-0.70563835 concerned teaching assistants +-0.70563835 is newly coined +-2.2979906 to be dispensed +-0.97038597 dispensed and mastered +-0.70563835 If dollar signs +-0.96771944 signs are dancing +-1.94782 , if raw +-0.5417812 if raw cupidity +-0.5417812 raw cupidity corrupts +-0.94109654 corrupts his soul +-1.94782 , if contemplating +-0.5417812 if contemplating materialist +-0.5417812 contemplating materialist purchases +-0.5417812 materialist purchases disturbs +-0.9596702 Disrupting this evolving +-0.5417812 this evolving relationship +-0.95859265 study by saddling +-0.9708349 saddling the questing +-1.2695354 obtaining a minimum +-0.9710474 menial , minimum +-0.70563835 minimum wage drudgery +-1.6743882 , by interfering +-2.3556416 with the ceaseless +-0.5417812 the ceaseless cogitation +-1.3968394 , one embraces +-0.96005386 embraces an obscene +-0.5417812 an obscene perversion +-2.0538197 that is holy +-0.9708349 To the carrels +-0.96894807 stand for undivided +-0.9306314 his academic attention +-0.54174346 for undivided attention +-0.9502854 our 
studies across +-0.9254905 our new barricades +-0.70563835 the so-called gGreat +-0.5417812 so-called gGreat Works +-0.8186068 Works h \ +-0.54174346 and misogyny \ +-0.70563835 h \ rightly +-0.5417812 \ rightly unread +-2.3159316 to their relentless +-0.5417812 their relentless racism +-0.97038597 racism and misogyny +-0.9348586 be our cobblestones +-1.5697957 world , unite +-0.7776412 will essentially shape +-0.97038597 for and passing +-0.5417812 and passing examinations +-0.70563835 at convenience stores +-0.9447781 to stay awake +-1.531915 to them failing +-0.92549914 even being asked +-1.2663147 anything that obstructs +-2.4202912 should be discouraged +-0.97041446 education a tool +-1.2663965 put their sons +-0.9653139 sons or daughters +-1.4391036 have their dreams +-0.5417812 their dreams dashed +-0.9655893 effort on behalf +-1.8380213 not only hurting +-1.6741663 but also destroying +-1.2678738 part-time in low-skilled +-2.4882603 able to flip +-0.877903 no importance whatsoever +-1.84251 may not pertain +-2.546066 in the U.S. +-1.2246652 In most U.S. +-1.2597992 I will voice +-2.0201955 student is admitted +-0.5417812 Conversely speaking +-2.548219 in the United +-0.5417812 the United States +-0.96671176 quite time consuming +-1.9262036 the time commuting +-0.9694666 classes is merely +-1.6721942 rather than merely +-1.2704892 even the obligatory +-2.0379982 order to sustain +-1.0482398 a healthy existence +-2.0921752 they will eat +-2.0784652 time to eat +-2.0476751 them to eat +-1.2549186 place to eat +-0.96005267 where people eat +-0.9655786 -LRB- as opposed +-1.4532362 I am opposed +-0.96952754 much is taken-on +-2.0020587 may be compromised +-0.96550435 complete with excellence +-0.96889466 finishing for finishing +-0.7055833 than merely finishing +-0.70563835 for finishing sake +-2.061142 not be burdened +-1.4436442 responsibility of fiscal +-1.6106554 and that none +-0.9594153 seems that none +-0.9623525 none should assume +-2.8185043 of the academically +-0.5417812 the academically minded +-1.8089019 A high-school +-0.8778697 student generally focuses +-0.9703093 attention of mastering +-2.5528169 , the foremost +-1.2796856 to explore concepts +-0.7776412 much greater detail +-0.9694723 an in depth +-0.970309 detail and depth +-0.97038597 thought and analysis +-0.97076845 Balancing the mundane +-0.96306306 free from mundane +-0.96857595 feel that holding +-0.97014666 realities of holding +-1.451693 college days holding +-1.4981937 will only distract +-0.8777878 will simply distract +-2.6879861 , and hamstring +-0.70563835 the overall efficacy +-0.7776412 with developing himself +-0.9710908 stable , confident +-1.349635 will be plenty +-0.96544003 days will negatively +-0.8778697 impact long term +-0.8452535 in times past +-0.8777878 are long past +-0.9348586 In our father +-0.97038597 fs and grandfather +-1.5699875 work to supply +-0.97041446 supply a modest +-0.97025436 family of five +-1.4454614 come to five +-0.94109654 allow his wife +-1.7395061 home and carrying +-2.3556416 with the entrance +-1.7421509 into the work-force +-0.5417812 the work-force coupled +-1.6469297 has become near +-0.54174346 I graduated near +-1.2707573 impossible to adequately +-2.3364315 of a four-year +-0.9653139 man or woman +-0.8970779 of fully dedicating +-0.92243314 Achieving high marks +-1.7211647 , time devoted +-2.4202912 should be safe-guarded +-2.6336813 to the maximum +-1.6414187 and not diluted +-0.9654407 diluted with worries +-0.7055833 from mundane worries +-0.7776412 or 
schedules associated +-0.97038597 obtain and hold-down +-2.1296859 is the ultimate +-1.3098111 am against anybody +-0.8885696 make great advances +-0.9433873 will become distracted +-1.6567447 they get distracted +-1.2597992 studies will falter +-2.3159316 to their schoolwork +-0.9710908 nights , standing +-2.4549673 in a hurry +-1.2637693 even be pressured +-0.97038597 years and running +-0.845408 of becoming overwhelmed +-0.89710486 A financially strapped +-0.8453008 reduce his\/her reliance +-1.4023011 it would behoove +-0.9710908 following , valid +-0.5417812 , valid counter-argument +-1.269442 is of paramount +-1.7396736 become a doctor +-0.96514726 I can tolerate +-1.5693139 means to repay +-2.401044 , I seriously +-0.5417812 I seriously jeopardize +-1.5887454 To coin +-0.96952105 currently in vogue +-2.3173888 The expression +-0.54174346 in vogue expression +-0.9710908 expression , gI +-0.8778697 gI am robbing +-0.5417812 am robbing Peter +-1.7539203 to pay Paul +-0.5417812 pay Paul h. +-1.6404386 during college rests +-2.3192668 The onus +-2.0303597 , as innumerable +-1.2695947 -LRB- and enlightened +-1.6340977 I was 18 +-1.5516561 would have liked +-1.919632 it is nonetheless +-1.445843 is , nonetheless +-0.93925536 A better approach +-2.145156 This approach +-0.5415809 complete opposite approach +-0.5415809 a rational approach +-1.639368 as I graduated +-1.4452091 pay the bills +-0.97041446 make a smooth +-1.1188048 be learned here +-0.88812464 rings true here +-0.87743556 main problem here +-0.5415809 be examined here +-1.2455192 have already started +-0.96203786 some money aside +-0.54174346 Putting aside +-1.566702 that of etime +-0.9556397 study when workloads +-0.96952105 high in volume +-1.7415998 work a shift +-0.5417812 a shift afterwards +-0.70563835 The expression ehealthy +-0.5417812 expression ehealthy body +-0.8885696 mind f rings +-2.3892295 they are fatigued +-0.96313107 fatigued from over-exertion +-2.1323423 at the weekend +-2.189477 they have availability +-1.2485076 but this negates +-2.3364315 of a 5-day +-1.9554842 with a 6 +-0.96952754 problem is compounded +-1.6540626 and that single +-1.2667356 required for relaxation +-2.2979906 to be held +-0.7776412 are rarely relied +-0.8186782 by anyone else +-2.4882603 able to utilize +-2.594901 have a detrimental +-2.3320854 on their outlook +-2.001421 lead to feelings +-0.9710908 frustration , depression +-1.5516689 or even pessimism +-2.2979906 to be honest +-1.2602372 wish I hadn +-1.2663965 change their perspective +-0.9623362 up working 30 +-0.97038597 week and hardly +-1.5693139 trying to catch +-1.2602372 which I ended +-1.2600931 spent on cars +-0.9710474 cars , fancy +-0.9579728 did my fancy +-0.9710908 clothes , cigarettes +-0.9710908 cigarettes , beer +-0.8886119 finally did drop +-0.96566236 employers I interviewed +-0.96550435 interviewed with cared +-2.2413182 as a bartender +-0.9447781 fancy clothes impress +-0.9710908 sure , anti-smoking +-0.7776412 Japan currently lag +-0.95589554 smoking would significantly +-0.54174346 currently lag significantly +-0.8453008 other developed nations +-1.0065018 and further progress +-0.9657569 Yet a total +-0.9657569 Implementing a total +-0.63090694 reduce the total +-1.4049516 say that smoking +-1.1312816 believe that smoking +-2.0916193 , a smoking +-2.1310654 of a smoking +-0.95611274 contracting a smoking +-0.96903455 risks , smoking +-1.5565242 effects of smoking +-1.9480665 for their smoking +-1.3126934 Of course smoking +-0.91499764 on restaurant smoking 
+-0.5376871 ban on smoking +-1.2207794 restrictions on smoking +-0.9311158 not themselves smoking +-2.0226793 I believe smoking +-0.8778926 taking up smoking +-0.8778926 give up smoking +-1.2699889 to quit smoking +-1.2853183 is always smoking +-0.7745659 as regular smoking +-0.87427455 already banned smoking +-0.30733097 to ban smoking +-0.15623896 for banning smoking +-0.233282 , banning smoking +-0.233282 by banning smoking +-0.15623896 of banning smoking +-0.5399919 is passive smoking +-0.5399919 Banning smoking +-0.5399919 to discourage smoking +-0.5399919 to introduce smoking +-0.5399919 to segregate smoking +-0.9559639 and would undoubtedly +-0.7776412 meet considerable resistance +-0.9328095 virtually no restrictions +-0.54174346 , partial restrictions +-1.2413292 restaurants would represent +-0.8885845 the complete opposite +-0.9514415 Between these polar +-0.5417812 these polar extremes +-0.9447781 a middle route +-1.0482398 is certainly advisable +-0.9709693 advisable to urge +-2.7230322 of the smoke +-1.2665644 amounts of smoke +-1.5625769 not to smoke +-1.7635506 them to smoke +-1.1946555 place to smoke +-0.93114495 individual to smoke +-1.4291619 allowed to smoke +-1.3362124 choose to smoke +-1.512289 wish to smoke +-0.93114495 rights to smoke +-1.1946555 exposure to smoke +-0.9046809 , who smoke +-1.4793116 those who smoke +-1.1521136 can still smoke +-0.88677007 second hand smoke +-0.34194267 , breathing smoke +-0.34194267 - breathing smoke +-0.70418733 are inhaling smoke +-0.21864995 that second-hand smoke +-0.21864995 of second-hand smoke +-0.21864995 to second-hand smoke +-0.77592903 to tobacco smoke +-0.23563899 Tobacco smoke +-0.54078573 people breathe smoke +-0.54078573 of secondhand smoke +-1.270913 so , regardless +-1.999989 , but regardless +-0.454951 health and well-being +-1.904193 , in considering +-2.474476 of the rights +-2.368377 to the rights +-1.7569928 against the rights +-0.9687079 within their rights +-1.353842 and health concerns +-1.6070328 In many places +-0.96748435 Restaurants are places +-0.6092521 in public places +-0.6092521 all public places +-1.5697957 world , partial +-1.6262265 reasons for banning +-0.9598097 justification for banning +-0.9707589 arguments , banning +-0.9580587 solely by banning +-0.96497023 effect of banning +-1.549747 purpose of banning +-0.8633952 without completely banning +-1.3568981 have been implemented +-2.2551448 can be legally +-0.5417812 be legally compelled +-1.5668873 provide a separate +-0.88862646 into two separate +-0.9710908 separate , ventilated +-0.5417812 , ventilated room +-0.9709698 of restaurant patrons +-0.7952932 those restaurant patrons +-0.94726676 their smoking patrons +-2.217416 , or alternatively +-0.94965845 afford such renovations +-0.4549004 smoking and non-smoking +-1.7775552 health of non-smoking +-0.96522456 comfort of non-smoking +-0.8968245 a fully non-smoking +-0.8453008 fully non-smoking establishment +-1.2455192 have already built +-0.94965845 built such facilities +-0.8186782 the current lax +-0.5417812 current lax standards +-0.70563835 a nice meal +-0.9255078 meal without exposing +-1.3220913 of getting cancer +-0.9710908 cancer , asthma +-1.2015057 and other respiratory +-1.4452091 are the worst +-0.5417812 the worst sufferers +-2.814478 of the ill +-0.8777878 become quite ill +-1.5673865 consider the effects +-0.9303269 negative health effects +-0.70534635 the ill effects +-0.5415809 clearly harmful effects +-0.8426969 Smoking alone +-0.5417812 Smoking alone kills +-1.1342123 and 
makes hundreds +-1.2482166 of them sick +-0.9424286 banned because nowadays +-1.1842161 seems like everywhere +-1.0481644 the law everywhere +-1.270913 restaurants , breathing +-0.8186068 smoke - breathing +-1.269442 loss of appetite +-1.7148496 and it smells +-0.95353854 means some harm +-0.96514726 Nothing can justify +-2.630871 to the interference +-0.54174346 can justify interference +-1.4310355 is it justified +-0.54174346 are perfectly justified +-0.9650865 smoke can affect +-0.87781 smoke does affect +-2.5330536 is a moral +-0.84262145 a reasonable justification +-0.54174346 a moral justification +-2.8185043 of the @ +-0.97038597 aware and guide +-1.2590392 -LRB- or choices +-1.5940906 the right choices +-0.7054753 similarly wise choices +-0.9708349 therefore the warnings +-2.3053756 and the campaigns +-0.96771944 smoking are perfectly +-1.5636237 smoking is passive +-1.7886008 you are inhaling +-1.3218164 is like inhaling +-1.6662002 smoke , unknowingly +-0.96771944 unknowingly are affecting +-0.9708349 within the premise +-2.5330536 is a rational +-2.8185043 of the employee +-0.9255064 countries like UK +-0.97038597 UK and parts +-0.9703093 parts of US +-0.9307381 a health conscious +-0.8453008 wants healthy citizens +-1.7207206 all restaurants despite +-1.3100164 it means infringement +-0.96857595 effect that second-hand +-1.5660601 effects of second-hand +-1.2704629 exposure to second-hand +-0.9708606 mind , non-smokers +-1.8044913 health of non-smokers +-0.96529967 has on non-smokers +-1.8840929 who are non-smokers +-1.5220007 are more prone +-1.5621343 good for smokers +-2.056268 not be smokers +-0.9704825 than the smokers +-0.9271531 number of smokers +-1.7768588 health of smokers +-2.5249763 have a smoke-free +-1.2605371 enjoy a smoke-free +-1.8689821 that will occur +-1.2695947 restaurants and potentially +-0.9703093 week of unwanted +-0.54174346 of unwanted exposure +-0.54174346 Regular exposure +-2.073563 is that tobacco +-0.9708217 customers to tobacco +-0.94712085 course smoking tobacco +-2.548219 in the air +-1.443957 has a definite +-2.0538197 that is served +-2.3192668 The smell +-0.8186782 smoke becomes entangled +-1.5693139 hard to tell +-0.9282507 food really tastes +-0.87781 is banned altogether +-0.9348126 non-smoking restaurants altogether +-1.5687809 consider the welfare +-1.6540626 and that includes +-2.531173 is a poison +-0.54174346 a habit-forming poison +-0.44458324 reason being anywhere +-0.3171513 being anywhere close +-1.3766972 Tobacco smoke permeates +-0.5417812 smoke permeates foods +-0.9602795 when people breathe +-2.3423157 , it alters +-2.5330536 is a habit-forming +-1.8048636 and a stimulant +-1.2496934 Many people react +-0.5417812 people react violently +-2.189477 they have heard +-1.6777825 in a smoky +-1.3351636 is too smoky +-1.2610925 's not fair +-0.9366768 goes through withdrawal +-0.5417812 through withdrawal pains +-1.3682667 They become addicted +-1.9548753 is an insult +-1.5671173 also a safety +-1.2709996 law , selfish +-0.96894807 arguments for implementing +-0.9674512 all be examined +-2.0754464 is that concerning +-0.97076845 made the wise +-0.7055833 made similarly wise +-1.238976 , be forced +-1.9353607 not be forced +-1.2641429 people are forced +-1.44561 forced to compromise +-0.7776412 wise choices regarding +-1.7034411 their own lifestyles +-0.9709693 government to discourage +-0.94732565 Perhaps most importantly +-0.94460577 taken into account +-2.4549673 in a smoke-filled +-2.061142 not be ignored +-0.9710908 additional , 
long-term +-2.3714485 on the national +-0.5417812 the national healthcare +-0.70563835 to seek treatment +-0.96894807 treatment for tobacco-related +-0.5417812 for tobacco-related illnesses +-1.5470109 it 's context +-0.9307304 arguments ; firstly +-0.9709693 restaurants to promote +-0.9674895 improve the comfort +-0.9674895 promote the comfort +-1.0482398 of non-smoking diners +-0.8426969 Despite proven +-0.70563835 very popular amongst +-0.9597012 amongst all ages +-0.9473217 highly social pastime +-1.6473839 has become deeply +-0.5417812 become deeply integrated +-0.9703093 chance of passively +-0.5417812 of passively contracting +-0.9099368 smoking related illness +-0.93079174 perhaps too severe +-0.96877724 damage their profits +-1.44561 be to introduce +-1.0482398 and non-smoking sections +-1.5693139 or to segregate +-0.95028013 environment which suits +-0.9709693 businesses to succeed +-0.93973964 brought about gradually +-1.6743882 , by improving +-2.8185043 of the clearly +-0.5417812 the clearly harmful +-1.566702 effects of secondhand + +\end\ diff --git a/include/test/lm_test.h b/include/test/lm_test.h new file mode 100644 index 000000000..b542aca9d --- /dev/null +++ b/include/test/lm_test.h @@ -0,0 +1,31 @@ +/** + * @file lm_test.h + * @author Sean Massung + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_LM_TEST_H_ +#define META_LM_TEST_H_ + +#include + +#include "test/unit_test.h" + +namespace meta +{ +namespace testing +{ +/** + * Runs all the language model tests, comparing to KenLM as the reference + * implementation. + * @see https://kheafield.com/code/kenlm/ + * @return the number of tests failed + */ +int lm_tests(); +} +} + +#endif diff --git a/src/lm/tools/lm-test.cpp b/src/lm/tools/lm-test.cpp index 4c07df0a5..eb7614202 100644 --- a/src/lm/tools/lm-test.cpp +++ b/src/lm/tools/lm-test.cpp @@ -17,20 +17,6 @@ using namespace meta; int main(int argc, char* argv[]) { - logging::set_cerr_logging(); - lm::language_model model{cpptoml::parse_file(argv[1])}; - lm::sentence s1{"I disagree with this statement for several reasons .", - false}; - std::cout << s1.to_string() << ": " << model.log_prob(s1) << std::endl; - lm::sentence s2{"I disagree with this octopus for several reasons .", - false}; - std::cout << s2.to_string() << ": " << model.log_prob(s2) << std::endl; - lm::sentence s3{"Hello world !", false}; - std::cout << s3.to_string() << ": " << model.log_prob(s3) << std::endl; - lm::sentence s4{"xyz xyz xyz", false}; - std::cout << s4.to_string() << ": " << model.log_prob(s4) << std::endl; - - /* if (argc != 3) { std::cerr << "Usage: " << argv[0] << " config.toml sentences.txt" @@ -38,6 +24,8 @@ int main(int argc, char* argv[]) return 1; } + logging::set_cerr_logging(); + lm::diff correcter{cpptoml::parse_file(argv[1])}; std::ifstream in{argv[2]}; auto num_sentences = filesystem::num_lines(argv[2]); @@ -78,5 +66,4 @@ int main(int argc, char* argv[]) prog.end(); std::cout << "Percent no-ops: " << do_nothing / done << std::endl; - */ } diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 026d20c0f..42f61132e 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -10,6 +10,7 @@ add_library(meta-testing analyzer_test.cpp libsvm_parser_test.cpp parallel_test.cpp ranker_test.cpp + lm_test.cpp stemmer_test.cpp string_list_test.cpp graph_test.cpp diff --git a/src/test/inverted_index_test.cpp 
diff --git a/src/test/inverted_index_test.cpp b/src/test/inverted_index_test.cpp
index f8e1da375..a668c2a87 100644
--- a/src/test/inverted_index_test.cpp
+++ b/src/test/inverted_index_test.cpp
@@ -56,7 +56,8 @@ void create_config(const std::string& corpus_type)
            << "[[analyzers]]\n"
            << "method = \"ngram-word\"\n"
            << "ngram = 1\n"
-           << "filter = \"default-chain\"";
+           << "filter = \"default-chain\"\n"
+           << "[language-model]\narpa-file = \"../data/english-sentences.arpa\"";
 }
 
 template
diff --git a/src/test/lm_test.cpp b/src/test/lm_test.cpp
new file mode 100644
index 000000000..4076b13a2
--- /dev/null
+++ b/src/test/lm_test.cpp
@@ -0,0 +1,41 @@
+/**
+ * @file lm_test.cpp
+ * @author Sean Massung
+ */
+
+#include "lm/sentence.h"
+#include "lm/language_model.h"
+#include "test/lm_test.h"
+#include "test/inverted_index_test.h"
+
+namespace meta
+{
+namespace testing
+{
+
+int lm_tests()
+{
+    int num_failed = 0;
+    create_config("line");
+
+    num_failed += testing::run_test(
+        "lm-test", [&]()
+        {
+            lm::language_model model{cpptoml::parse_file("config.toml")};
+            lm::sentence s1{
+                "I disagree with this statement for several reasons .", false};
+            lm::sentence s2{
+                "I disagree with this octopus for several reasons .", false};
+            lm::sentence s3{"Hello world !", false};
+            lm::sentence s4{"xyz xyz xyz", false};
+
+            ASSERT_APPROX_EQUAL(model.log_prob(s1), -5.0682507);
+            ASSERT_APPROX_EQUAL(model.log_prob(s2), -11.7275571);
+            ASSERT_APPROX_EQUAL(model.log_prob(s3), -11.0764951);
+            ASSERT_APPROX_EQUAL(model.log_prob(s4), -16.4180412);
+        });
+
+    return num_failed;
+}
+}
+}
diff --git a/src/test/tools/unit-test.cpp b/src/test/tools/unit-test.cpp
index aadf37bec..8bc142b1d 100644
--- a/src/test/tools/unit-test.cpp
+++ b/src/test/tools/unit-test.cpp
@@ -21,6 +21,7 @@
 #include "test/graph_test.h"
 #include "test/compression_test.h"
 #include "test/parser_test.h"
+#include "test/lm_test.h"
 #include "util/printing.h"
 
 using namespace meta;
@@ -47,6 +48,7 @@ int main(int argc, char* argv[])
         std::cerr << " \"compression\": runs compression reading and writing tests" << std::endl;
         std::cerr << " \"graph\": runs undirected and directed graph tests" << std::endl;
         std::cerr << " \"parser\": runs parser tests" << std::endl;
+        std::cerr << " \"language-model\": runs language model tests" << std::endl;
         return 1;
     }
 
@@ -84,6 +86,8 @@ int main(int argc, char* argv[])
         num_failed += testing::graph_tests();
     if (all || args.find("parser") != args.end())
         num_failed += testing::parser_tests();
+    if (all || args.find("language-model") != args.end())
+        num_failed += testing::lm_tests();
 
     return num_failed;
 }
diff --git a/src/test/unit_tests.cmake b/src/test/unit_tests.cmake
index 11abacb23..a288fdd67 100644
--- a/src/test/unit_tests.cmake
+++ b/src/test/unit_tests.cmake
@@ -53,3 +53,7 @@ set_tests_properties(graph PROPERTIES TIMEOUT 10 WORKING_DIRECTORY
 
 add_test(parser ${UNIT_TEST_EXE} parser)
 set_tests_properties(parser PROPERTIES TIMEOUT 10 WORKING_DIRECTORY
     ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+
+add_test(language-model ${UNIT_TEST_EXE} language-model)
+set_tests_properties(language-model PROPERTIES TIMEOUT 10 WORKING_DIRECTORY
+    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
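For orientation before the next patch: the .arpa files consumed by read_arpa_format look roughly like the sketch below. Counts, words, and probabilities here are invented for illustration; real files, like the trigram data above, are far larger. Each entry is a log10 probability, the n-gram itself, and (for all but the highest order) an optional backoff weight.

```
\data\
ngram 1=4
ngram 2=2

\1-grams:
-2.1	<unk>
-1.3	<s>	-0.5
-1.3	</s>
-0.9	smoking	-0.7

\2-grams:
-0.6	ban smoking
-1.1	smoking </s>

\end\
```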
From 9f65ce19b97af798d3a0ea4e076ad86230702a3d Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Fri, 8 May 2015 15:22:17 -0500
Subject: [PATCH 118/481] store LM in multiple hash tables; one for each order

---
 include/lm/language_model.h |  2 +-
 src/lm/language_model.cpp   | 28 ++++++++++++++--------------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/lm/language_model.h b/include/lm/language_model.h
index ca96ff0e2..4a3e903cb 100644
--- a/include/lm/language_model.h
+++ b/include/lm/language_model.h
@@ -102,7 +102,7 @@ class language_model
         float backoff;
     };
 
-    std::unordered_map<std::string, lm_node> lm_;
+    std::vector<std::unordered_map<std::string, lm_node>> lm_;
 };
 
 class language_model_exception : public std::runtime_error
diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp
index bf8c592c2..76465f99e 100644
--- a/src/lm/language_model.cpp
+++ b/src/lm/language_model.cpp
@@ -32,13 +32,12 @@ void language_model::read_arpa_format(const std::string& arpa_file)
 
     // get to beginning of unigram data
     while (std::getline(infile, buffer))
-    {
         if (buffer.find("\\1-grams:") == 0)
             break;
-    }
 
     N_ = 0;
+    lm_.push_back({}); // add current n-value data
 
     while (std::getline(infile, buffer))
     {
         if (buffer.empty())
@@ -47,6 +46,7 @@ void language_model::read_arpa_format(const std::string& arpa_file)
         if (buffer[0] == '\\')
         {
             ++N_;
+            lm_.push_back({}); // add current n-value data
             continue;
         }
 
@@ -57,20 +57,20 @@ void language_model::read_arpa_format(const std::string& arpa_file)
         float backoff = 0.0;
         if (second_tab != std::string::npos)
             backoff = std::stof(buffer.substr(second_tab + 1));
-        lm_[ngram] = {prob, backoff};
+        lm_[N_][ngram] = {prob, backoff};
     }
 }
 
 std::string language_model::next_token(const sentence& tokens,
                                        double random) const
 {
-    throw language_model_exception{"could not generate next token: "
-                                   + tokens.to_string()};
+    throw language_model_exception{"not implemented!"};
 }
 
 std::vector<std::pair<std::string, float>>
     language_model::top_k(const sentence& prev, size_t k) const
 {
+    throw language_model_exception{"not implemented!"};
 }
 
 std::string language_model::generate(unsigned int seed) const
@@ -82,15 +82,15 @@ float language_model::prob_calc(sentence tokens) const
 {
     if (tokens.size() == 1)
     {
-        auto it = lm_.find(tokens[0]);
-        if (it != lm_.end())
+        auto it = lm_[0].find(tokens[0]);
+        if (it != lm_[0].end())
             return it->second.prob;
-        return lm_.at("<unk>").prob;
+        return lm_[0].at("<unk>").prob;
     }
     else
     {
-        auto it = lm_.find(tokens.to_string());
-        if (it != lm_.end())
+        auto it = lm_[tokens.size() - 1].find(tokens.to_string());
+        if (it != lm_[tokens.size() - 1].end())
             return it->second.prob;
 
         auto hist = tokens(0, tokens.size() - 1);
@@ -98,13 +98,13 @@ float language_model::prob_calc(sentence tokens) const
         if (tokens.size() == 1)
        {
             hist = hist(0, 1);
-            auto it = lm_.find(hist[0]);
-            if (it == lm_.end())
+            auto it = lm_[0].find(hist[0]);
+            if (it == lm_[0].end())
                 hist.substitute(0, "<unk>");
         }
 
-        it = lm_.find(hist.to_string());
-        if (it != lm_.end())
+        it = lm_[hist.size() - 1].find(hist.to_string());
+        if (it != lm_[hist.size() - 1].end())
             return it->second.backoff + prob_calc(tokens);
         return prob_calc(tokens);
     }
}
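To make the backoff recursion in prob_calc concrete, here is a minimal, self-contained sketch of the same lookup strategy. It mirrors the one-table-per-order layout, but it is not MeTA's actual API: plain strings as keys, no `<unk>` handling, and a sentinel score for unseen unigrams.

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

struct node { float prob; float backoff; };
// one table per n-gram order: tables[0] = unigrams, tables[1] = bigrams, ...
using table = std::unordered_map<std::string, node>;

float prob_calc(const std::vector<std::string>& toks,
                const std::vector<table>& tables)
{
    std::string key = toks[0];
    for (std::size_t i = 1; i < toks.size(); ++i)
        key += " " + toks[i];

    const auto& tbl = tables[toks.size() - 1];
    auto it = tbl.find(key);
    if (it != tbl.end())      // exact n-gram found
        return it->second.prob;

    if (toks.size() == 1)     // unseen unigram; a real model falls back to <unk>
        return -99.0f;

    // back off: the history's backoff weight plus the (n-1)-gram probability
    std::string hist = key.substr(0, key.rfind(' '));      // first n-1 tokens
    std::vector<std::string> shorter{toks.begin() + 1, toks.end()};
    auto hit = tables[toks.size() - 2].find(hist);
    float bo = hit != tables[toks.size() - 2].end() ? hit->second.backoff : 0.0f;
    return bo + prob_calc(shorter, tables);
}

int main()
{
    std::vector<table> tables(2);
    tables[0] = {{"ban", {-0.9f, -0.4f}}, {"smoking", {-1.1f, 0.0f}}};
    tables[1] = {{"ban smoking", {-0.6f, 0.0f}}};
    std::vector<std::string> query{"ban", "smoking"};
    std::cout << prob_calc(query, tables) << "\n"; // hits the bigram: -0.6
}
```

If the bigram "ban smoking" were missing, the result would instead be backoff("ban") + P("smoking") = -0.4 + -1.1 = -1.5, which is exactly the shape of the recursion above.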
From e2b1fbfebcfd076e4f2b4bc7d6b96a20e66656b6 Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Fri, 8 May 2015 19:08:58 -0500
Subject: [PATCH 119/481] language_model::top_k; don't force sentence tags in log_prob

---
 src/lm/language_model.cpp | 33 +++++++++++++++++++++++++------
 src/test/lm_test.cpp      | 10 ++++++----
 2 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp
index 76465f99e..5d2e4c0fa 100644
--- a/src/lm/language_model.cpp
+++ b/src/lm/language_model.cpp
@@ -70,7 +70,31 @@ std::string language_model::next_token(const sentence& tokens,
 std::vector<std::pair<std::string, float>>
     language_model::top_k(const sentence& prev, size_t k) const
 {
-    throw language_model_exception{"not implemented!"};
+    // this is horribly inefficient due to this LM's structure
+    using pair_t = std::pair<std::string, float>;
+    auto comp = [](const pair_t& a, const pair_t& b)
+    {
+        return a.second > b.second;
+    };
+    std::vector<pair_t> candidates;
+    sentence candidate = prev;
+    candidate.push_back("word"); // the last item is replaced each iteration
+    for (auto& word : lm_[0])
+    {
+        candidate.substitute(candidate.size() - 1, word.first);
+        candidates.emplace_back(word.first, log_prob(candidate));
+        std::push_heap(candidates.begin(), candidates.end(), comp);
+        if (candidates.size() > k)
+        {
+            std::pop_heap(candidates.begin(), candidates.end(), comp);
+            candidates.pop_back();
+        }
+    }
+
+    for (auto end = candidates.end(); end != candidates.begin(); --end)
+        std::pop_heap(candidates.begin(), end, comp);
+
+    return candidates;
 }
 
 std::string language_model::generate(unsigned int seed) const
@@ -112,8 +136,6 @@ float language_model::prob_calc(sentence tokens) const
 
 float language_model::log_prob(sentence tokens) const
 {
-    tokens.push_front("<s>");
-    tokens.push_back("</s>");
     float prob = 0.0f;
 
     // tokens < N
@@ -139,8 +161,7 @@ float language_model::perplexity(const sentence& tokens) const
 {
     if (tokens.size() == 0)
         throw language_model_exception{"perplexity() called on empty sentence"};
-    return std::pow(
-        10.0, -(log_prob(tokens) / (tokens.size() + 2))); // +2 for <s> and </s>
+    return std::pow(10.0, -(log_prob(tokens) / tokens.size()));
 }
 
 float language_model::perplexity_per_word(const sentence& tokens) const
{
     if (tokens.size() == 0)
         throw language_model_exception{
             "perplexity_per_word() called on empty sentence"};
-    return perplexity(tokens) / (tokens.size() + 2); // +2 for <s> and </s>
+    return perplexity(tokens) / tokens.size();
 }
 }
 }
diff --git a/src/test/lm_test.cpp b/src/test/lm_test.cpp
index 4076b13a2..9b0768348 100644
--- a/src/test/lm_test.cpp
+++ b/src/test/lm_test.cpp
@@ -23,11 +23,13 @@ int lm_tests()
         {
             lm::language_model model{cpptoml::parse_file("config.toml")};
             lm::sentence s1{
-                "I disagree with this statement for several reasons .", false};
+                "<s> I disagree with this statement for several reasons . </s>",
+                false};
             lm::sentence s2{
-                "I disagree with this octopus for several reasons .", false};
+                "<s> I disagree with this octopus for several reasons . </s>",
+                false};
-            lm::sentence s3{"Hello world !", false};
-            lm::sentence s4{"xyz xyz xyz", false};
+            lm::sentence s3{"<s> Hello world ! </s>", false};
+            lm::sentence s4{"<s> xyz xyz xyz </s>", false};
 
             ASSERT_APPROX_EQUAL(model.log_prob(s1), -5.0682507);
             ASSERT_APPROX_EQUAL(model.log_prob(s2), -11.7275571);
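The bounded-heap pattern used in top_k above deserves a standalone illustration. This sketch keeps the k largest of a stream of scored items without sorting everything; it is generic C++, not tied to MeTA:

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main()
{
    using pair_t = std::pair<std::string, float>;
    // with this comparator, the *worst* kept candidate sits at the heap top,
    // so it is the one evicted when a better candidate arrives
    auto comp = [](const pair_t& a, const pair_t& b) { return a.second > b.second; };

    const std::size_t k = 2;
    std::vector<pair_t> heap;
    std::vector<pair_t> stream{{"a", -3.0f}, {"b", -1.0f}, {"c", -2.5f}, {"d", -0.5f}};

    for (const auto& item : stream)
    {
        heap.push_back(item);
        std::push_heap(heap.begin(), heap.end(), comp);
        if (heap.size() > k)
        {
            std::pop_heap(heap.begin(), heap.end(), comp); // move worst to back
            heap.pop_back();                               // and discard it
        }
    }

    // repeated pop_heap leaves the vector sorted from best to worst
    for (auto end = heap.end(); end != heap.begin(); --end)
        std::pop_heap(heap.begin(), end, comp);

    for (const auto& p : heap)
        std::cout << p.first << " " << p.second << "\n"; // d -0.5, then b -1
}
```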
From 0600cd5f7597c8e7c5cd953f4438bd4a485c253d Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Fri, 8 May 2015 19:42:36 -0500
Subject: [PATCH 120/481] remove sentence generation and add some error
 handling

---
 include/lm/language_model.h | 13 -------------
 src/lm/language_model.cpp   | 16 ++++------------
 2 files changed, 4 insertions(+), 25 deletions(-)

diff --git a/include/lm/language_model.h b/include/lm/language_model.h
index 4a3e903cb..0cda52cb2 100644
--- a/include/lm/language_model.h
+++ b/include/lm/language_model.h
@@ -30,19 +30,6 @@ class language_model
      */
     language_model(const cpptoml::table& config);
 
-    /**
-     * Randomly generates one token sequence based on <s> and </s> symbols.
-     * @return a random sequence of tokens based on this language model
-     */
-    std::string generate(unsigned int seed) const;
-
-    /**
-     * @param sentence The previous N - 1 tokens
-     * @param random A random number on [0, 1] used for choosing the next token
-     * @return the next token based on the previous tokens
-     */
-    std::string next_token(const sentence& sen, double random) const;
-
     /**
      * @param sentence A sequence of tokens
      * @return the perplexity of this token sequence given the current language
diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp
index 5d2e4c0fa..58b94289b 100644
--- a/src/lm/language_model.cpp
+++ b/src/lm/language_model.cpp
@@ -61,12 +61,6 @@ void language_model::read_arpa_format(const std::string& arpa_file)
     }
 }
 
-std::string language_model::next_token(const sentence& tokens,
-                                       double random) const
-{
-    throw language_model_exception{"not implemented!"};
-}
-
 std::vector<std::pair<std::string, float>>
     language_model::top_k(const sentence& prev, size_t k) const
 {
@@ -81,7 +75,7 @@ std::vector<std::pair<std::string, float>>
     candidate.push_back("word"); // the last item is replaced each iteration
     for (auto& word : lm_[0])
     {
-        candidate.substitute(candidate.size() - 1, word.first);
+        auto candidate = sentence{prev.to_string() + " " + word.first};
         candidates.emplace_back(word.first, log_prob(candidate));
         std::push_heap(candidates.begin(), candidates.end(), comp);
         if (candidates.size() > k)
         {
@@ -97,13 +91,11 @@ std::vector<std::pair<std::string, float>>
     return candidates;
 }
 
-std::string language_model::generate(unsigned int seed) const
-{
-    return "";
-}
-
 float language_model::prob_calc(sentence tokens) const
 {
+    if (tokens.size() == 0)
+        throw language_model_exception{"prob_calc: tokens is empty!"};
+
     if (tokens.size() == 1)
     {
         auto it = lm_[0].find(tokens[0]);
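With these checks in place, callers get an exception instead of garbage for empty input. A plausible (untested) usage sketch against the public API shown above; config.toml, sentence, and language_model_exception are as in the surrounding patches:

```cpp
#include <iostream>
#include "cpptoml.h"
#include "lm/language_model.h"
#include "lm/sentence.h"

using namespace meta;

int main()
{
    lm::language_model model{cpptoml::parse_file("config.toml")};
    try
    {
        lm::sentence empty{"", false}; // no tokens at all
        std::cout << model.perplexity(empty) << std::endl;
    }
    catch (const lm::language_model_exception& e)
    {
        // e.g. "perplexity() called on empty sentence"
        std::cerr << "LM error: " << e.what() << std::endl;
    }
}
```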
From 4002ec23099cd61b3b6bf8de409f067b654a3078 Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Sun, 10 May 2015 14:05:58 -0500
Subject: [PATCH 121/481] build unordered_map with bucket_size constructor
 parameter

---
 include/lm/language_model.h | 39 +++++++++++++++++++++----------------
 src/lm/language_model.cpp   | 15 +++++++++++---
 2 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/include/lm/language_model.h b/include/lm/language_model.h
index 0cda52cb2..9e69c59f4 100644
--- a/include/lm/language_model.h
+++ b/include/lm/language_model.h
@@ -23,6 +23,26 @@ namespace lm
 {
 class language_model
 {
+  private:
+    /**
+     * Simple struct to keep track of probabilities and backoff values.
+     */
+    struct lm_node
+    {
+        lm_node() : prob{0.0f}, backoff{0.0f}
+        {
+        }
+
+        lm_node(float p, float b) : prob{p}, backoff{b}
+        {
+        }
+
+        float prob;
+        float backoff;
+    };
+
+    using map_t = std::unordered_map<std::string, lm_node>;
+
   public:
     /**
      * Creates an N-gram language model based on the corpus specified in the
@@ -57,7 +77,7 @@ class language_model
      * @return a sorted vector of likely next tokens
      */
     std::vector<std::pair<std::string, float>> top_k(const sentence& prev,
-                                                  size_t k) const;
+                                                     size_t k) const;
 
   private:
     /**
@@ -94,21 +114,7 @@ class language_model
     uint64_t N_; /// The "n" value for this n-gram language model
 
-    /**
-     * Simple struct to keep track of probabilities and backoff values.
-     */
-    struct lm_node
-    {
-        lm_node():
-            prob{0.0f}, backoff{0.0f} {}
-
-        lm_node(float p, float b):
-            prob{p}, backoff{b} {}
-
-        float prob;
-        float backoff;
-    };
-
-    std::vector<std::unordered_map<std::string, lm_node>> lm_;
+    std::vector<map_t> lm_;
 };
 
 class language_model_exception : public std::runtime_error
diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp
index 58b94289b..67f1bc9e9 100644
--- a/src/lm/language_model.cpp
+++ b/src/lm/language_model.cpp
@@ -30,14 +30,23 @@ void language_model::read_arpa_format(const std::string& arpa_file)
     std::ifstream infile{arpa_file};
     std::string buffer;
 
-    // get to beginning of unigram data
+    // get to beginning of unigram data, saving the counts of each ngram type
+    std::vector<uint64_t> count;
     while (std::getline(infile, buffer))
+    {
+        if (buffer.find("ngram ") == 0)
+        {
+            auto equal = buffer.find_first_of("=");
+            count.emplace_back(std::stoi(buffer.substr(equal + 1)));
+        }
+
         if (buffer.find("\\1-grams:") == 0)
             break;
+    }
 
     N_ = 0;
-    lm_.push_back({}); // add current n-value data
+    lm_.emplace_back(count[N_]); // add current n-value data
 
     while (std::getline(infile, buffer))
     {
         if (buffer.empty())
@@ -47,7 +55,7 @@ void language_model::read_arpa_format(const std::string& arpa_file)
         if (buffer[0] == '\\')
         {
             ++N_;
-            lm_.push_back({}); // add current n-value data
+            lm_.emplace_back(count[N_]); // add current n-value data
             continue;
         }
From 49b9b8b80f87f0ac566ceec846b347dcf0db033e Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Sun, 10 May 2015 14:21:32 -0500
Subject: [PATCH 122/481] time .arpa file loading for benchmark

---
 src/lm/language_model.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp
index 67f1bc9e9..6cdbdd623 100644
--- a/src/lm/language_model.cpp
+++ b/src/lm/language_model.cpp
@@ -10,8 +10,10 @@
 #include
 #include
 #include
+#include "util/time.h"
 #include "util/shim.h"
 #include "lm/language_model.h"
+#include "logging/logger.h"
 
 namespace meta
 {
@@ -22,7 +24,12 @@ language_model::language_model(const cpptoml::table& config)
 {
     auto table = config.get_table("language-model");
     auto arpa_file = table->get_as<std::string>("arpa-file");
-    read_arpa_format(*arpa_file);
+    LOG(info) << "Loading language model from .arpa file... " << ENDLG;
+    auto time = common::time([&]()
+                             {
+                                 read_arpa_format(*arpa_file);
+                             });
+    LOG(info) << "Done. (" << time.count() << "ms)" << ENDLG;
 }
 
 void language_model::read_arpa_format(const std::string& arpa_file)
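A quick standalone illustration of why the bucket-count constructor in patch 121 matters: pre-sizing an unordered_map avoids repeated rehashing during bulk insertion. Plain C++; the count here is an illustrative value, not taken from any real .arpa file:

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <unordered_map>

int main()
{
    // counts come from the "ngram 1=...", "ngram 2=..." header lines of the
    // .arpa file, so each table can be sized before any entries are inserted
    std::size_t unigram_count = 65000; // illustrative value

    std::unordered_map<std::string, float> presized(unigram_count);
    std::unordered_map<std::string, float> grown;

    std::cout << "buckets up front: " << presized.bucket_count() << "\n";
    std::cout << "buckets when empty: " << grown.bucket_count() << "\n";
    // inserting 65k entries into `grown` triggers several rehashes, each of
    // which re-links every element; `presized` never needs to rehash
}
```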
From f819e3d7841d01effae18586c0bdcef1f6b95ec7 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Tue, 2 Jun 2015 21:09:49 -0500
Subject: [PATCH 123/481] Fix accidental double-definition of ceeaus download
 URL.

---
 src/test/tools/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/test/tools/CMakeLists.txt b/src/test/tools/CMakeLists.txt
index b17c3318b..47f226799 100644
--- a/src/test/tools/CMakeLists.txt
+++ b/src/test/tools/CMakeLists.txt
@@ -2,7 +2,6 @@ ExternalProject_Add(ceeaus
   SOURCE_DIR ${meta_BINARY_DIR}/../../data/ceeaus
   DOWNLOAD_DIR ${meta_BINARY_DIR}/downloads
   URL http://web.engr.illinois.edu/~massung1/files/ceeaus-metadata.tar.gz
-  URL http://web.engr.illinois.edu/~massung1/files/ceeaus.tar.gz
   URL_HASH "SHA256=8ea40b32f34e9ae8aedffe562ad468fc465d1cc0ff6a5c3bdf0ee42bb85c231e"
   CONFIGURE_COMMAND ""
   BUILD_COMMAND ""
From f98ed59c2439be766bd80dc348b3719c169428dd Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Tue, 9 Jun 2015 13:04:05 -0500
Subject: [PATCH 124/481] sanity check for length_filter min and max

---
 src/analyzers/filters/length_filter.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/analyzers/filters/length_filter.cpp b/src/analyzers/filters/length_filter.cpp
index baebad4cb..4a23f4456 100644
--- a/src/analyzers/filters/length_filter.cpp
+++ b/src/analyzers/filters/length_filter.cpp
@@ -20,6 +20,9 @@ length_filter::length_filter(std::unique_ptr<token_stream> source,
                              uint64_t min, uint64_t max)
     : source_{std::move(source)}, min_length_{min}, max_length_{max}
 {
+    using exception = token_stream::token_stream_exception;
+    if (min_length_ > max_length_)
+        throw exception{"min filter length is greater than max filter length"};
     next_token();
 }
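Pulling the pieces together, the filter documentation added in the next patch corresponds to config.toml entries along the lines of the sketch below. The filter ids and values here are illustrative assumptions, not verbatim from this series; note how the new sanity check makes a chain with min > max fail fast.

```toml
[[analyzers]]
method = "ngram-word"
ngram = 1

    [[analyzers.filter]]
    type = "icu-tokenizer"

    [[analyzers.filter]]
    type = "lowercase"

    [[analyzers.filter]]
    type = "length"
    min = 2   # must be <= max, or the constructor now throws
    max = 32
```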
From 1454fea1a779f98d2aad24b01fe01f118256ccb1 Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Tue, 9 Jun 2015 13:12:20 -0500
Subject: [PATCH 125/481] document filters re: #91

---
 include/analyzers/filters/alpha_filter.h          |  3 +++
 include/analyzers/filters/empty_sentence_filter.h |  3 +++
 include/analyzers/filters/english_normalizer.h    |  3 +++
 include/analyzers/filters/icu_filter.h            |  9 +++++++++
 include/analyzers/filters/length_filter.h         |  8 ++++++++
 include/analyzers/filters/list_filter.h           | 10 ++++++++++
 include/analyzers/filters/lowercase_filter.h      |  3 +++
 include/analyzers/filters/porter2_stemmer.h       |  3 +++
 include/analyzers/filters/ptb_normalizer.h        |  3 +++
 include/analyzers/filters/sentence_boundary.h     |  9 +++++++++
 10 files changed, 54 insertions(+)

diff --git a/include/analyzers/filters/alpha_filter.h b/include/analyzers/filters/alpha_filter.h
index a1f2d0fa2..bb6bfbaeb 100644
--- a/include/analyzers/filters/alpha_filter.h
+++ b/include/analyzers/filters/alpha_filter.h
@@ -23,6 +23,9 @@ namespace filters
 /**
  * Filter that removes "non-letter" characters from tokens. "Letterness" is
  * determined by the Unicode properties of each codepoint in the token.
+ *
+ * Required config parameters: none.
+ * Optional config parameters: none.
  */
 class alpha_filter : public util::clonable<token_stream, alpha_filter>
 {
diff --git a/include/analyzers/filters/empty_sentence_filter.h b/include/analyzers/filters/empty_sentence_filter.h
index cf4020c86..61403f4d4 100644
--- a/include/analyzers/filters/empty_sentence_filter.h
+++ b/include/analyzers/filters/empty_sentence_filter.h
@@ -23,6 +23,9 @@ namespace filters
  * Filter that removes any empty sentences from the token stream. Empty
  * sentences can be caused by filters in the filter chain that follow
  * sentence boundary detection.
+ *
+ * Required config parameters: none.
+ * Optional config parameters: none.
  */
 class empty_sentence_filter
     : public util::clonable<token_stream, empty_sentence_filter>
diff --git a/include/analyzers/filters/english_normalizer.h b/include/analyzers/filters/english_normalizer.h
index 0b26e8a62..2e506e7e0 100644
--- a/include/analyzers/filters/english_normalizer.h
+++ b/include/analyzers/filters/english_normalizer.h
@@ -27,6 +27,9 @@ namespace filters
 * whitespace (adjacent whitespace tokens are converted to a single
 * normalized space token) and punctuation (which is split out from words
 * following basic heuristics).
+ *
+ * Required config parameters: none.
+ * Optional config parameters: none.
  */
 class english_normalizer
     : public util::clonable<token_stream, english_normalizer>
diff --git a/include/analyzers/filters/icu_filter.h b/include/analyzers/filters/icu_filter.h
index a63468a61..dceb128df 100644
--- a/include/analyzers/filters/icu_filter.h
+++ b/include/analyzers/filters/icu_filter.h
@@ -29,6 +29,15 @@ namespace filters
 /**
  * Filter that applies an ICU transliteration to each token in the
  * sequence.
+ *
+ * Required config parameters:
+ * ```toml
+ * id = "transformer"
+ * ```
+ *
+ * Optional config parameters: none.
+ *
+ * @see http://userguide.icu-project.org/transforms/general/rules
  */
 class icu_filter : public util::clonable<token_stream, icu_filter>
 {
diff --git a/include/analyzers/filters/length_filter.h b/include/analyzers/filters/length_filter.h
index 893df4cab..e8bab2939 100644
--- a/include/analyzers/filters/length_filter.h
+++ b/include/analyzers/filters/length_filter.h
@@ -30,6 +30,14 @@ namespace filters
 /**
  * Filter that only retains tokens that are within a certain length range,
  * inclusive.
+ *
+ * Required config parameters:
+ * ```toml
+ * min = 2 # any integer
+ * max = 32 # any integer >= min
+ * ```
+ *
+ * Optional config parameters: none.
  */
 class length_filter : public util::clonable<token_stream, length_filter>
 {
diff --git a/include/analyzers/filters/list_filter.h b/include/analyzers/filters/list_filter.h
index 68ab4fad8..bf9b70acc 100644
--- a/include/analyzers/filters/list_filter.h
+++ b/include/analyzers/filters/list_filter.h
@@ -30,6 +30,16 @@ namespace filters
 /**
  * Filter that either removes or keeps tokens from a given list.
+ *
+ * Required config parameters:
+ * ```toml
+ * file = "path"
+ * ```
+ * Optional config parameters:
+ * ```toml
+ * type = "accept" # or,
+ * type = "reject" # default
+ * ```
  */
 class list_filter : public util::clonable<token_stream, list_filter>
 {
diff --git a/include/analyzers/filters/lowercase_filter.h b/include/analyzers/filters/lowercase_filter.h
index 9ddc17cc8..0985a08e0 100644
--- a/include/analyzers/filters/lowercase_filter.h
+++ b/include/analyzers/filters/lowercase_filter.h
@@ -22,6 +22,9 @@ namespace filters
 
 /**
  * Filter that converts all tokens to lowercase.
+ *
+ * Required config parameters: none.
+ * Optional config parameters: none.
  */
 class lowercase_filter : public util::clonable<token_stream, lowercase_filter>
 {
diff --git a/include/analyzers/filters/porter2_stemmer.h b/include/analyzers/filters/porter2_stemmer.h
index 3dd1b4ee9..65e8cb5e3 100644
--- a/include/analyzers/filters/porter2_stemmer.h
+++ b/include/analyzers/filters/porter2_stemmer.h
@@ -24,6 +24,9 @@ namespace filters
 /**
  * Filter that stems words according to the porter2 stemmer algorithm.
  * Requires that the porter2 stemmer project submodule be downloaded.
+ *
+ * Required config parameters: none.
+ * Optional config parameters: none.
  */
 class porter2_stemmer : public util::clonable<token_stream, porter2_stemmer>
 {
diff --git a/include/analyzers/filters/ptb_normalizer.h b/include/analyzers/filters/ptb_normalizer.h
index d5734b6e3..bb8c7358a 100644
--- a/include/analyzers/filters/ptb_normalizer.h
+++ b/include/analyzers/filters/ptb_normalizer.h
@@ -25,6 +25,9 @@ namespace filters
 * A filter that normalizes text to match Penn Treebank conventions. This
 * is important as a preprocessing step for input to POS taggers and
 * parsers that were trained on Penn Treebank formatted data.
+ *
+ * Required config parameters: none.
+ * Optional config parameters: none.
  */
 class ptb_normalizer : public util::clonable<token_stream, ptb_normalizer>
 {
diff --git a/include/analyzers/filters/sentence_boundary.h b/include/analyzers/filters/sentence_boundary.h
index 48a0c30d4..26dbcccf8 100644
--- a/include/analyzers/filters/sentence_boundary.h
+++ b/include/analyzers/filters/sentence_boundary.h
@@ -33,6 +33,15 @@ namespace filters
 * Filter that adds sentence boundary tokens ("<s>" and "</s>") to streams of
 * tokens. This filter requires that whitespace and punctuation be present
 * in the source stream.
+ *
+ * Required config parameters:
+ * ```toml
+ * punctuation = "path"
+ * start-exceptions = "path"
+ * end-exceptions = "path"
+ * ```
+ *
+ * Optional config parameters: none.
  */
 class sentence_boundary : public util::clonable<token_stream, sentence_boundary>
 {
From 33f69c412486d6719450ab45792c8f6db7118ac9 Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Tue, 9 Jun 2015 13:25:22 -0500
Subject: [PATCH 126/481] document user-side analyzers re: #91

---
 include/analyzers/ngram/ngram_word_analyzer.h   | 12 ++++++++++++
 include/parser/analyzers/tree_analyzer.h        | 14 ++++++++++++++
 include/sequence/analyzers/ngram_pos_analyzer.h | 15 +++++++++++++++
 3 files changed, 41 insertions(+)

diff --git a/include/analyzers/ngram/ngram_word_analyzer.h b/include/analyzers/ngram/ngram_word_analyzer.h
index 8611c7d7b..5280f4082 100644
--- a/include/analyzers/ngram/ngram_word_analyzer.h
+++ b/include/analyzers/ngram/ngram_word_analyzer.h
@@ -20,6 +20,18 @@ namespace analyzers
 
 /**
  * Analyzes documents using their tokenized words.
+ *
+ * Required config parameters:
+ * ```toml
+ * [[analyzers]]
+ * method = "ngram-word" # this analyzer
+ * ngram = 1 # integer required
+ * filter = "default-chain" # filter type required
+ * ```
+ *
+ * Optional config parameters: none.
+ *
+ * @see https://meta-toolkit.org/analyzers-filters-tutorial.html
  */
 class ngram_word_analyzer : public util::multilevel_clonable
 {
diff --git a/include/sequence/analyzers/ngram_pos_analyzer.h b/include/sequence/analyzers/ngram_pos_analyzer.h
index c2809ef36..d9d27b674 100644
--- a/include/sequence/analyzers/ngram_pos_analyzer.h
+++ b/include/sequence/analyzers/ngram_pos_analyzer.h
@@ -27,6 +27,21 @@ namespace analyzers
 * other filters added. This tokenizer should be used to ensure that capital
 * letters and such may be used as features. Function words and stop words
 * should *not* be removed and words should not be stemmed for the same reason.
+ *
+ * Required config parameters:
+ * ```toml
+ * [[analyzers]]
+ * method = "ngram-pos" # this analyzer
+ * ngram = 1 # integer required
+ * crf-prefix = "path"
+ * [[analyzers.filter]]
+ * type = "icu-tokenizer" # recommended
+ * ```
+ *
+ * Optional config parameters: none.
+ *
+ * @see https://meta-toolkit.org/analyzers-filters-tutorial.html
+ */
 class ngram_pos_analyzer
     : public util::multilevel_clonable
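As a worked configuration for the analyzer documented above, a config.toml entry might look like the following sketch; the crf-prefix path is a placeholder, and the filter chain simply follows the recommendation in the doc comment:

```toml
[[analyzers]]
method = "ngram-pos"
ngram = 1
crf-prefix = "crf"   # placeholder: directory holding a trained CRF model

    [[analyzers.filter]]
    type = "icu-tokenizer"   # recommended above; keeps capitalization intact
```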
Date: Sun, 14 Jun 2015 16:21:51 +0300
Subject: [PATCH 127/481] Added building instructions for Fedora 20+

---
 README.md | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/README.md b/README.md
index 34d4a6970..e0b5e0b18 100644
--- a/README.md
+++ b/README.md
@@ -318,6 +318,90 @@ ctest --output-on-failure
 If everything passes, congratulations! MeTA seems to be working on your
 system.
 
+
+
+## Fedora Build Guide
+
+This has been tested with Fedora 20+. You may have success with earlier versions, but this is not tested.
+
+To get started, install some dependencies:
+
+```bash
+# These may be already installed
+sudo yum install make git wget
+
+# libicu-devel is probably not installed by default
+sudo yum install g++ libicu-devel
+```
+
+Now, you will need [cmake](http://www.cmake.org/) to compile the toolkit. `cmake` 2.8 is available in Fedora repos, but a newer version (3.1) is required for this project.
+Install cmake 3.1 with the following commands:
+
+```
+wget http://www.cmake.org/files/v3.1/cmake-3.1.1-Linux-x86_64.sh
+sudo sh cmake-3.1.1-Linux-x86_64.sh --prefix=/usr/local
+```
+
+During CMake installation, you should agree to the license and then say "n"
+to including the subdirectory. You should be able to run the following
+commands and see the following output:
+
+```bash
+g++ --version
+```
+
+should print
+
+```
+g++ (GCC) 4.8.3 20140911 (Red Hat 4.8.3-7)
+Copyright (C) 2013 Free Software Foundation, Inc.
+This is free software; see the source for copying conditions. There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE
+```
+
+and
+
+```bash
+/usr/local/bin/cmake --version
+```
+
+should print
+
+```
+cmake version 3.1.1
+
+CMake suite maintained and supported by Kitware (kitware.com/cmake).
+```
+
+Once the dependencies are all installed, you should be ready to build. Run
+the following commands to get started:
+
+```
+# clone the project
+git clone https://github.com/meta-toolkit/meta.git
+cd meta/
+
+# set up submodules
+git submodule update --init --recursive
+
+# set up a build directory
+mkdir build
+cd build
+cp ../config.toml .
+
+# configure and build the project
+CXX=g++ /usr/local/bin/cmake ../ -DCMAKE_BUILD_TYPE=Release
+make
+```
+
+You can now test the system with the following command:
+
+```bash
+ctest --output-on-failure
+```
+
+
+
 ## EWS/EngrIT Build Guide
 
 If you are on a machine managed by Engineering IT at UIUC, you should follow
 this guide. These systems have software that is much too old for
+This has been tested with Fedora 20+. You may have success with earlier
+versions, but this is not tested.

 To get started, install some dependencies:

@@ -331,11 +333,12 @@ To get started, install some dependencies:
 sudo yum install make git wget

 # libicu-devel is probably not installed by default
-sudo yum install g++ libicu-devel
+sudo yum install g++ libicu-devel
 ```
-
-Now, you will need [cmake](http://www.cmake.org/) to compile the toolkit. `cmake` 2.8 is available in Fedora repos, but a newer version (3.1) is required for this project.
-Install cmake 3.1 with the following commands:
+
+Now, you will need [cmake](http://www.cmake.org/) to compile the toolkit.
+`cmake` 2.8 is available in Fedora repos, but a newer version (3.1) is required
+for this project. Install cmake 3.1 with the following commands:

 ```
 wget http://www.cmake.org/files/v3.1/cmake-3.1.1-Linux-x86_64.sh
@@ -373,7 +376,7 @@ cmake version 3.1.1

 CMake suite maintained and supported by Kitware (kitware.com/cmake).
 ```

-Once the dependencies are all installed, you should be ready to build. Run
+Once the dependencies are all installed, you should be ready to build. Run
 the following commands to get started:

 ```
 # clone the project
 git clone https://github.com/meta-toolkit/meta.git
 cd meta/

 # set up submodules
 git submodule update --init --recursive

 # set up a build directory
 mkdir build
 cd build
 cp ../config.toml .

 # configure and build the project
 CXX=g++ /usr/local/bin/cmake ../ -DCMAKE_BUILD_TYPE=Release
-make
+make
 ```

 You can now test the system with the following command:

From 0f03bcf5ad560ba48d698f7d3ce3b9c7508d476a Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sun, 14 Jun 2015 14:51:15 -0500
Subject: [PATCH 129/481] Force path to ceeaus qrels in ir-eval tests.

Before, it was reading the qrels path from the config file, making the
tests seemingly randomly break when you used a different set of qrels
for something unrelated, and then went back to run the tests...
---
 src/test/inverted_index_test.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/test/inverted_index_test.cpp b/src/test/inverted_index_test.cpp
index f8e1da375..aa3ab7aeb 100644
--- a/src/test/inverted_index_test.cpp
+++ b/src/test/inverted_index_test.cpp
@@ -24,10 +24,6 @@ void create_config(const std::string& corpus_type)
     if (!libsvm_modules)
         throw std::runtime_error{"\"libsvm-modules\" not in config"};

-    auto query_judgements = orig_config.get_as<std::string>("query-judgements");
-    if (!query_judgements)
-        throw std::runtime_error{"\"query-judgements\" not in config"};
-
     auto punctuation = orig_config.get_as<std::string>("punctuation");
     if (!punctuation)
         throw std::runtime_error{"\"punctuation\" not in config"};
@@ -46,7 +42,7 @@ void create_config(const std::string& corpus_type)
        << "end-exceptions = \"" << *end_exceptions << "\"\n"
        << "prefix = \"" << *orig_config.get_as<std::string>("prefix") << "\"\n"
-       << "query-judgements = \"" << *query_judgements << "\"\n"
+       << "query-judgements = \"../data/ceeaus-qrels.txt\"\n"
        << "libsvm-modules = \"" << *libsvm_modules << "\"\n"
        << "dataset = \"ceeaus\"\n"
        << "corpus = \"" << corpus_type << ".toml\"\n"

From ca06861c7400923a06d5f4ca0a4d457540f7ed42 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sun, 14 Jun 2015 03:16:23 -0500
Subject: [PATCH 130/481] Add profiling support to CMakeLists.txt.
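When configured with `cmake ../ -DENABLE_PROFILING=On`, binaries are
linked against gperftools' libprofiler. As a minimal usage sketch (not
part of this patch; it assumes gperftools is installed and uses its
standard ProfilerStart/ProfilerStop API), a workload can be profiled and
the resulting output file inspected with gperftools' pprof tool:

```cpp
#include <gperftools/profiler.h> // gperftools CPU profiler interface

int main()
{
    ProfilerStart("meta.prof"); // begin sampling; output goes to meta.prof
    // ... workload of interest, e.g. indexing a corpus ...
    for (volatile long i = 0; i < 100000000; ++i)
        ; // placeholder busy loop so this sketch is self-contained
    ProfilerStop(); // flush samples and stop sampling
}
```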
--- CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7cc864453..9e3857b7c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.0.0) project(meta) option(USE_LIBCXX "Use libc++ for the C++ standard library" ON) +option(ENABLE_PROFILING "Link against gperftools profiler library" OFF) set(CMAKE_EXPORT_COMPILE_COMMANDS 1) @@ -148,6 +149,12 @@ if(LIBCXX_LIBRARY) target_link_libraries(meta-definitions INTERFACE -L${LIBCXX_LIB_PATH}) endif() +if(ENABLE_PROFILING) + find_library(GPERFTOOLS_PROFILER NAMES profiler REQUIRED) + message("-- Found profiler: ${GPERFTOOLS_PROFILER}") + target_link_libraries(meta-definitions INTERFACE ${GPERFTOOLS_PROFILER}) +endif() + if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") target_compile_definitions(meta-definitions INTERFACE -D_DARWIN_USE_64_BIT_INODE=1) From 6ec0b9e7beacaadae852b6941de364eea90f65a0 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sun, 14 Jun 2015 16:41:46 -0500 Subject: [PATCH 131/481] Change utf::segmenter to use UText instead of icu::UnicodeString. This saves us the time going from utf8 -> utf16 and back after doing the segmentation; it turns out that the icu::BreakIterator hierarchy can actually segment utf8 text directly using a read-only UText view over the data. We now only have to copy the data into the segmenter as utf8 and don't create any temporary icu::UnicodeStrings anymore. --- src/utf/segmenter.cpp | 46 ++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/src/utf/segmenter.cpp b/src/utf/segmenter.cpp index 040e94ba4..98115d860 100644 --- a/src/utf/segmenter.cpp +++ b/src/utf/segmenter.cpp @@ -59,8 +59,8 @@ class segmenter::impl * Copy constructs an impl. * @param other The impl to copy. */ - impl(const impl& other) - : u_str_{other.u_str_}, + impl(const impl& other) : + text_{other.text_}, sentence_iter_{other.sentence_iter_->clone()}, word_iter_{other.word_iter_->clone()} { @@ -76,26 +76,17 @@ class segmenter::impl */ void set_content(const std::string& str) { - u_str_ = icu::UnicodeString::fromUTF8(str); + text_ = str; } /** - * Obtains a utf-8 encoded string by first extracting the utf-16 - * encoded substring between the given indices and converting that - * substring to utf-8. 
- * * @param begin The beginning index * @param end The ending index * @return the substring between begin and end */ std::string substr(int32_t begin, int32_t end) const { -#ifdef META_ICU_NO_TEMP_SUBSTRING - icu::UnicodeString substring{u_str_, begin, end - begin}; -#else - auto substring = u_str_.tempSubStringBetween(begin, end); -#endif - return icu_to_u8str(substring); + return text_.substr(begin, end - begin); } /** @@ -113,7 +104,7 @@ class segmenter::impl */ std::vector sentences() const { - return segments(0, u_str_.length(), segment_t::SENTENCES); + return segments(0, text_.length(), segment_t::SENTENCES); } /** @@ -122,7 +113,7 @@ class segmenter::impl */ std::vector words() const { - return segments(0, u_str_.length(), segment_t::WORDS); + return segments(0, text_.length(), segment_t::WORDS); } /** @@ -139,7 +130,6 @@ class segmenter::impl segment_t type) const { std::vector results; - auto status = U_ZERO_ERROR; icu::BreakIterator* iter; if (type == segment_t::SENTENCES) iter = sentence_iter_.get(); @@ -148,19 +138,24 @@ class segmenter::impl else throw std::runtime_error{"Unknown segmentation type"}; + auto status = U_ZERO_ERROR; + UText utxt = UTEXT_INITIALIZER; + utext_openUTF8(&utxt, text_.c_str() + first, last - first, &status); if (!U_SUCCESS(status)) { - std::string err = "Failed to segment: "; + std::string err = "Failed to open UText: "; err += u_errorName(status); throw std::runtime_error{err}; } -#ifdef META_ICU_NO_TEMP_SUBSTRING - icu::UnicodeString substring{u_str_, first, last - first}; - iter->setText(substring); -#else - iter->setText(u_str_.tempSubStringBetween(first, last)); -#endif + iter->setText(&utxt, status); + if (!U_SUCCESS(status)) + { + utext_close(&utxt); + std::string err = "Failed to setText: "; + err += u_errorName(status); + throw std::runtime_error{err}; + } auto start = iter->first(); auto end = iter->next(); @@ -170,12 +165,13 @@ class segmenter::impl start = end; end = iter->next(); } + utext_close(&utxt); return results; } private: - /// The internal ICU string - icu::UnicodeString u_str_; + /// The utf8 string we are segmenting + std::string text_; /// A pointer to a sentence break iterator std::unique_ptr sentence_iter_; /// A pointer to a word break iterator From e1d5363bff019b5b7c0eb4e2942cbde3417819a6 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sun, 14 Jun 2015 16:48:20 -0500 Subject: [PATCH 132/481] Switch to returning a reference to PrimaryKey in postings_data. When PrimaryKey == std::string, returning by value was causing a ton of string copies to be made for simple things like operator<() and operator==(), which have no business copying the id. 
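As a minimal illustration (a hypothetical stand-in type, not code from
this patch): with a by-value accessor, every comparison below allocates
and copies a string, while the const-reference version compares in place.

```cpp
#include <iostream>
#include <string>

class pdata // hypothetical stand-in for a postings_data with string keys
{
  public:
    // by value, every call copies p_id_'s buffer:
    // std::string primary_key() const { return p_id_; }

    // by const reference, no copy is made:
    const std::string& primary_key() const { return p_id_; }

  private:
    std::string p_id_{"a term id long enough to heap-allocate"};
};

bool operator<(const pdata& lhs, const pdata& rhs)
{
    // no temporary strings are created with the reference-returning accessor
    return lhs.primary_key() < rhs.primary_key();
}

int main()
{
    pdata a, b;
    std::cout << std::boolalpha << (a < b) << "\n";
}
```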
--- include/index/postings_data.h | 2 +- include/index/postings_data.tcc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/index/postings_data.h b/include/index/postings_data.h index a492a8f28..268100956 100644 --- a/include/index/postings_data.h +++ b/include/index/postings_data.h @@ -148,7 +148,7 @@ class postings_data /** * @return the term_id for this postings_data */ - PrimaryKey primary_key() const; + const PrimaryKey& primary_key() const; /** * @param new_key diff --git a/include/index/postings_data.tcc b/include/index/postings_data.tcc index e7071d47a..41cf6be26 100644 --- a/include/index/postings_data.tcc +++ b/include/index/postings_data.tcc @@ -113,7 +113,7 @@ bool operator==(const postings_data& lhs, } template -PrimaryKey postings_data::primary_key() const +const PrimaryKey& postings_data::primary_key() const { return p_id_; } From b9a5a774f1a59e072e8b1dacad39e8c5d4f4abb6 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sun, 14 Jun 2015 16:54:08 -0500 Subject: [PATCH 133/481] Change remove_if to a function template. We don't need to make a std::function here since we aren't storing it and don't need the type erasure features at all. --- include/utf/utf.h | 37 +++++++++++++++++++++++++++++++++++-- src/utf/detail.h | 16 +--------------- src/utf/utf.cpp | 17 ----------------- 3 files changed, 36 insertions(+), 34 deletions(-) diff --git a/include/utf/utf.h b/include/utf/utf.h index 39dbf3ed5..a9b96f6aa 100644 --- a/include/utf/utf.h +++ b/include/utf/utf.h @@ -13,11 +13,29 @@ #include #include +#include + namespace meta { namespace utf { +/** + * Helper method that appends a UTF-32 codepoint to the given utf8 string. + * @param dest The string to append the codepoint to + * @param codepoint The UTF-32 codepoint to append + */ +inline void utf8_append_codepoint(std::string& dest, uint32_t codepoint) +{ + std::array buf; + int32_t len = 0; + UBool err = FALSE; + U8_APPEND(&buf[0], len, U8_MAX_LENGTH, codepoint, err); + if (err) + throw std::runtime_error{"failed to add codepoint to string"}; + dest.append(reinterpret_cast(&buf[0]), len); +} + /** * Converts a string from the given charset to utf8. * @param str The string to convert @@ -94,8 +112,23 @@ std::string transform(const std::string& str, const std::string& id); * @return a utf8 formatted string with all codepoints matching pred * removed */ -std::string remove_if(const std::string& str, - std::function pred); +template +std::string remove_if(const std::string& str, Predicate&& pred) +{ + std::string result; + result.reserve(str.size()); + const char* s = str.c_str(); + int32_t length = str.length(); + for (int32_t i = 0; i < length;) + { + UChar32 codepoint; + U8_NEXT(s, i, length, codepoint); + if (pred(codepoint)) + continue; + utf8_append_codepoint(result, codepoint); + } + return result; +} /** * @return the number of code points in a utf8 string. diff --git a/src/utf/detail.h b/src/utf/detail.h index 04b484e79..e03b63360 100644 --- a/src/utf/detail.h +++ b/src/utf/detail.h @@ -87,21 +87,7 @@ inline std::string icu_to_u8str(const icu::UnicodeString& icu_str) return u8str; } -/** - * Helper method that appends a UTF-32 codepoint to the given utf8 string. 
- * @param dest The string to append the codepoint to - * @param codepoint The UTF-32 codepoint to append - */ -inline void utf8_append_codepoint(std::string& dest, uint32_t codepoint) -{ - std::array buf; - int32_t len = 0; - UBool err = FALSE; - U8_APPEND(&buf[0], len, U8_MAX_LENGTH, codepoint, err); - if (err) - throw std::runtime_error{"failed to add codepoint to string"}; - dest.append(reinterpret_cast(&buf[0]), len); -} + } } #endif diff --git a/src/utf/utf.cpp b/src/utf/utf.cpp index ecc475264..5cb26144c 100644 --- a/src/utf/utf.cpp +++ b/src/utf/utf.cpp @@ -106,23 +106,6 @@ std::string foldcase(const std::string& str) return result; } -std::string remove_if(const std::string& str, - std::function pred) -{ - std::string result; - const char* s = str.c_str(); - int32_t length = str.length(); - for (int32_t i = 0; i < length;) - { - UChar32 codepoint; - U8_NEXT(s, i, length, codepoint); - if (pred(codepoint)) - continue; - utf8_append_codepoint(result, codepoint); - } - return result; -} - bool isalpha(uint32_t codepoint) { return u_isalpha(codepoint); From 0749b10ed0007c5e13e054101b3a26a4b1177c39 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sun, 14 Jun 2015 16:58:47 -0500 Subject: [PATCH 134/481] Add a reserve() to icu_to_u8str(). The size is wrong, generally, but it seems to be improving the number of allocations we're making (since text is mostly ASCII). --- src/utf/detail.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/utf/detail.h b/src/utf/detail.h index e03b63360..e791b8ad5 100644 --- a/src/utf/detail.h +++ b/src/utf/detail.h @@ -83,6 +83,8 @@ inline std::u16string icu_to_u16str(const icu::UnicodeString& icu_str) inline std::string icu_to_u8str(const icu::UnicodeString& icu_str) { std::string u8str; + u8str.reserve(icu_str.length()); // this is not right in general, but is a + // reasonable guess for ascii icu_str.toUTF8String(u8str); return u8str; } From aac768a12fe0c33cff259b38128be48061f79070 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sun, 14 Jun 2015 22:01:20 -0500 Subject: [PATCH 135/481] Add missing include. --- include/utf/utf.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/utf/utf.h b/include/utf/utf.h index a9b96f6aa..d60160fb9 100644 --- a/include/utf/utf.h +++ b/include/utf/utf.h @@ -10,6 +10,7 @@ #ifndef META_UTF8_H_ #define META_UTF8_H_ +#include #include #include From 1a535493e90ce7bd90606c77954a27bbee898c68 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sun, 14 Jun 2015 23:21:14 -0500 Subject: [PATCH 136/481] Further reduce string allocations in indexing. - Replaced string copying in the utf::segmenter to use a util::string_view instead, eliminating a string copy for each word in the tokenized document. - Use move semantics to set_content on tokenizers/filters, which will save one string copy per document. 
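A minimal sketch of the new convention (simplified types, not the real
token_stream interface): taking set_content() by rvalue reference lets a
single buffer travel down the whole filter chain instead of being copied
at every link.

```cpp
#include <iostream>
#include <string>
#include <utility>

struct tokenizer_like // hypothetical innermost stage of a filter chain
{
    std::string content_;
    void set_content(std::string&& content)
    {
        content_ = std::move(content); // take ownership of the buffer
    }
};

struct filter_like // hypothetical filter wrapping the tokenizer
{
    tokenizer_like source_;
    void set_content(std::string&& content)
    {
        source_.set_content(std::move(content)); // forward, don't copy
    }
};

int main()
{
    filter_like chain;
    std::string doc = "document text, read once from the corpus";
    chain.set_content(std::move(doc)); // one allocation serves the chain
    std::cout << chain.source_.content_ << "\n";
}
```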
--- include/analyzers/filters/alpha_filter.h | 2 +- .../analyzers/filters/empty_sentence_filter.h | 2 +- .../analyzers/filters/english_normalizer.h | 2 +- include/analyzers/filters/icu_filter.h | 2 +- include/analyzers/filters/length_filter.h | 2 +- include/analyzers/filters/list_filter.h | 2 +- include/analyzers/filters/lowercase_filter.h | 2 +- include/analyzers/filters/porter2_stemmer.h | 2 +- include/analyzers/filters/ptb_normalizer.h | 2 +- include/analyzers/filters/sentence_boundary.h | 2 +- include/analyzers/token_stream.h | 2 +- .../tokenizers/character_tokenizer.h | 2 +- include/analyzers/tokenizers/icu_tokenizer.h | 2 +- .../tokenizers/whitespace_tokenizer.h | 2 +- include/utf/segmenter.h | 10 +- include/util/hash.h | 226 +++++++ include/util/string_view.h | 617 ++++++++++++++++++ src/analyzers/filters/alpha_filter.cpp | 4 +- .../filters/empty_sentence_filter.cpp | 4 +- src/analyzers/filters/english_normalizer.cpp | 4 +- src/analyzers/filters/icu_filter.cpp | 4 +- src/analyzers/filters/length_filter.cpp | 4 +- src/analyzers/filters/list_filter.cpp | 4 +- src/analyzers/filters/lowercase_filter.cpp | 4 +- src/analyzers/filters/porter2_stemmer.cpp | 4 +- src/analyzers/filters/ptb_normalizer.cpp | 4 +- src/analyzers/filters/sentence_boundary.cpp | 4 +- .../tokenizers/character_tokenizer.cpp | 4 +- src/analyzers/tokenizers/icu_tokenizer.cpp | 8 +- .../tokenizers/whitespace_tokenizer.cpp | 4 +- src/analyzers/tools/tokenize_test.cpp | 2 +- src/lm/language_model.cpp | 2 +- src/sequence/crf/tools/pos_tag.cpp | 2 +- src/utf/segmenter.cpp | 14 +- 34 files changed, 902 insertions(+), 55 deletions(-) create mode 100644 include/util/hash.h create mode 100644 include/util/string_view.h diff --git a/include/analyzers/filters/alpha_filter.h b/include/analyzers/filters/alpha_filter.h index bb6bfbaeb..2931b8464 100644 --- a/include/analyzers/filters/alpha_filter.h +++ b/include/analyzers/filters/alpha_filter.h @@ -46,7 +46,7 @@ class alpha_filter : public util::clonable * Sets the content for the beginning of the filter chain. * @param content The string content to set */ - void set_content(const std::string& content) override; + void set_content(std::string&& content) override; /** * Obtains the next token in the sequence. diff --git a/include/analyzers/filters/empty_sentence_filter.h b/include/analyzers/filters/empty_sentence_filter.h index 61403f4d4..a219a6877 100644 --- a/include/analyzers/filters/empty_sentence_filter.h +++ b/include/analyzers/filters/empty_sentence_filter.h @@ -48,7 +48,7 @@ class empty_sentence_filter * Sets the content for the beginning of the filter chain. * @param content The string content to set */ - void set_content(const std::string& content) override; + void set_content(std::string&& content) override; /** * Obtains the next token in the sequence. diff --git a/include/analyzers/filters/english_normalizer.h b/include/analyzers/filters/english_normalizer.h index 2e506e7e0..3544288e9 100644 --- a/include/analyzers/filters/english_normalizer.h +++ b/include/analyzers/filters/english_normalizer.h @@ -52,7 +52,7 @@ class english_normalizer * Sets the content for the beginning of the filter chain. * @param content The string content to set */ - void set_content(const std::string& content) override; + void set_content(std::string&& content) override; /** * Obtains the next token in the sequence. 
diff --git a/include/analyzers/filters/icu_filter.h b/include/analyzers/filters/icu_filter.h index dceb128df..875ce20ca 100644 --- a/include/analyzers/filters/icu_filter.h +++ b/include/analyzers/filters/icu_filter.h @@ -60,7 +60,7 @@ class icu_filter : public util::clonable * Sets the content for the beginning of the filter chain. * @param content The string content to set */ - void set_content(const std::string& content) override; + void set_content(std::string&& content) override; /** * @return the next token in the sequence. diff --git a/include/analyzers/filters/length_filter.h b/include/analyzers/filters/length_filter.h index e8bab2939..4ac9d9b8d 100644 --- a/include/analyzers/filters/length_filter.h +++ b/include/analyzers/filters/length_filter.h @@ -63,7 +63,7 @@ class length_filter : public util::clonable * Sets the content for the beginning of the filter chain. * @param content The string content to set */ - void set_content(const std::string& content) override; + void set_content(std::string&& content) override; /** * @return the next token in the sequence diff --git a/include/analyzers/filters/list_filter.h b/include/analyzers/filters/list_filter.h index bf9b70acc..f94a5a770 100644 --- a/include/analyzers/filters/list_filter.h +++ b/include/analyzers/filters/list_filter.h @@ -78,7 +78,7 @@ class list_filter : public util::clonable * Sets the content for the beginning of the filter chain. * @param content The string content to set */ - void set_content(const std::string& content) override; + void set_content(std::string&& content) override; /** * @return the next token in the sequence. diff --git a/include/analyzers/filters/lowercase_filter.h b/include/analyzers/filters/lowercase_filter.h index 0985a08e0..d00fd1aab 100644 --- a/include/analyzers/filters/lowercase_filter.h +++ b/include/analyzers/filters/lowercase_filter.h @@ -46,7 +46,7 @@ class lowercase_filter : public util::clonable * Sets the content for the beginning of the filter chain. * @param content The string content to set */ - void set_content(const std::string& content) override; + void set_content(std::string&& content) override; /** * Obtains the next token in the sequence. diff --git a/include/analyzers/filters/porter2_stemmer.h b/include/analyzers/filters/porter2_stemmer.h index 65e8cb5e3..0c7fced33 100644 --- a/include/analyzers/filters/porter2_stemmer.h +++ b/include/analyzers/filters/porter2_stemmer.h @@ -48,7 +48,7 @@ class porter2_stemmer : public util::clonable * Sets the content for the beginning of the filter chain. * @param content The string content to set */ - void set_content(const std::string& content) override; + void set_content(std::string&& content) override; /** * Obtains the next token in the sequence. diff --git a/include/analyzers/filters/ptb_normalizer.h b/include/analyzers/filters/ptb_normalizer.h index bb8c7358a..d51a7f8d8 100644 --- a/include/analyzers/filters/ptb_normalizer.h +++ b/include/analyzers/filters/ptb_normalizer.h @@ -49,7 +49,7 @@ class ptb_normalizer : public util::clonable * Sets the content for the beginning of the filter chain. * @param content The string content to set */ - void set_content(const std::string& content) override; + void set_content(std::string&& content) override; /** * Obtains the next token in the sequence. 
diff --git a/include/analyzers/filters/sentence_boundary.h b/include/analyzers/filters/sentence_boundary.h index 26dbcccf8..f5969d10e 100644 --- a/include/analyzers/filters/sentence_boundary.h +++ b/include/analyzers/filters/sentence_boundary.h @@ -69,7 +69,7 @@ class sentence_boundary : public util::clonable * Sets the content for the beginning of the filter chain. * @param content The string content to set */ - void set_content(const std::string& content) override; + void set_content(std::string&& content) override; /** * @return the next token in the sequence. diff --git a/include/analyzers/token_stream.h b/include/analyzers/token_stream.h index f9d146a6e..5fca98d83 100644 --- a/include/analyzers/token_stream.h +++ b/include/analyzers/token_stream.h @@ -42,7 +42,7 @@ class token_stream * Sets the content for the stream. * @param content The string content to set */ - virtual void set_content(const std::string& content) = 0; + virtual void set_content(std::string&& content) = 0; /** * Destructor. diff --git a/include/analyzers/tokenizers/character_tokenizer.h b/include/analyzers/tokenizers/character_tokenizer.h index 943fc6106..15f0773fe 100644 --- a/include/analyzers/tokenizers/character_tokenizer.h +++ b/include/analyzers/tokenizers/character_tokenizer.h @@ -44,7 +44,7 @@ class character_tokenizer * Sets the content for the tokenizer. * @param content The string content to set */ - void set_content(const std::string& content) override; + void set_content(std::string&& content) override; /** * @return the next token in the document. This token will contain a diff --git a/include/analyzers/tokenizers/icu_tokenizer.h b/include/analyzers/tokenizers/icu_tokenizer.h index 7391c2844..3706ec8c7 100644 --- a/include/analyzers/tokenizers/icu_tokenizer.h +++ b/include/analyzers/tokenizers/icu_tokenizer.h @@ -92,7 +92,7 @@ class icu_tokenizer : public util::clonable * output as utf-8 encoded strings. * @param content The string content to set */ - void set_content(const std::string& content) override; + void set_content(std::string&& content) override; /** * @return the next token in the document. This will either by a diff --git a/include/analyzers/tokenizers/whitespace_tokenizer.h b/include/analyzers/tokenizers/whitespace_tokenizer.h index 3ab94d15d..be7ca7fe3 100644 --- a/include/analyzers/tokenizers/whitespace_tokenizer.h +++ b/include/analyzers/tokenizers/whitespace_tokenizer.h @@ -45,7 +45,7 @@ class whitespace_tokenizer : public util::clonable #include "util/optional.h" #include "util/pimpl.h" +#include "util/string_view.h" namespace meta { @@ -88,11 +89,14 @@ class segmenter ~segmenter(); /** - * Resets the content of the segmenter to the given string. + * Resets the content of the segmenter to the given string. The + * segmenter *does not own the data it operates over*, rather, the + * caller is responsible for guaranteeing that the string does not + * invalidate while the segmenter is acting over it. 
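+ * (For example, a caller should keep the backing std::string alive in a
+ * local variable until it is finished calling sentences() and words().)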
* * @param str A utf-8 string that should be segmented */ - void set_content(const std::string& str); + void set_content(util::string_view str); /** * Segments the current content into sentences by following the @@ -125,7 +129,7 @@ class segmenter * encoded string * @param seg the segment to get content for */ - std::string content(const segment& seg) const; + util::string_view content(const segment& seg) const; private: class impl; diff --git a/include/util/hash.h b/include/util/hash.h new file mode 100644 index 000000000..b75a90046 --- /dev/null +++ b/include/util/hash.h @@ -0,0 +1,226 @@ +/** + * @file hash.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_UTIL_HASH_H_ +#define META_UTIL_HASH_H_ + +#include + +namespace meta +{ +namespace util +{ + +/** + * Implementation of MurmurHash3. Depending on the template parameter, it + * will return a 32-bit or 64-bit hash value. + */ +template +struct murmur_hash; + +namespace +{ +inline uint32_t rotl(uint32_t x, int8_t r) +{ + return (x << r) | (x >> (32 - r)); +} + +inline uint64_t rotl(uint64_t x, int8_t r) +{ + return (x << r) | (x >> (64 - r)); +} + +inline uint32_t fmix(uint32_t h) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +inline uint64_t fmix(uint64_t h) +{ + h ^= h >> 33; + h *= 0xff51afd7ed558ccdLLU; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53LLU; + h ^= h >> 33; + + return h; +} +} + +/** + * Murmur3Hash for 32-bit outputs. Based on MurmurHash3_x86_32. + */ +template <> +struct murmur_hash<4> +{ + constexpr murmur_hash() = default; + + std::size_t operator()(const uint8_t* data, int len, uint32_t seed) + { + std::size_t out = seed; + + const auto nblocks = len / 4; + + constexpr uint32_t c1 = 0xcc9e2d51; + constexpr uint32_t c2 = 0x1b873593; + + auto blocks = reinterpret_cast(data + nblocks * 4); + + for (int i = -nblocks; i; ++i) + { + auto k1 = blocks[i]; + + k1 *= c1; + k1 = rotl(k1, 15); + k1 *= c2; + + out ^= k1; + out = rotl(out, 13); + out = out * 5 + 0xe6546b64; + } + + const uint8_t* tail = data + nblocks * 4; + + uint32_t k1 = 0; + switch (len & 3) + { + case 3: + k1 ^= tail[2] << 16; + case 2: + k1 ^= tail[1] << 8; + case 1: + k1 ^= tail[0]; + k1 *= c1; + k1 = rotl(k1, 15); + k1 *= c2; + out ^= k1; + } + + out ^= len; + + return fmix(out); + } +}; + +/** + * MurmurHash3 for 64-bit outputs. Based on MurmurHash3_x64_128. 
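+ * The full 128-bit digest is computed internally; the two halves are
+ * mixed together and only the first 64 bits are returned, which is why
+ * the final h2 += h1 of the reference implementation can be skipped.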
+ */ +template <> +struct murmur_hash<8> +{ + constexpr murmur_hash() = default; + + std::size_t operator()(const uint8_t* data, int len, uint64_t seed) + { + const auto nblocks = len / 16; + + auto h1 = seed; + auto h2 = seed; + + const uint64_t c1 = 0x87c37b91114253d5LLU; + const uint64_t c2 = 0x4cf5ad432745937fLLU; + + auto blocks = reinterpret_cast(data); + + for (int i = 0; i < nblocks; ++i) + { + auto k1 = blocks[i * 2]; + auto k2 = blocks[i * 2 + 1]; + + k1 *= c1; + k1 = rotl(k1, 31); + k1 *= c2; + h1 ^= k1; + + h1 = rotl(h1, 27); + h1 += h2; + h1 = h1 * 5 + 0x52dce729; + + k2 *= c2; + k2 = rotl(k2, 33); + k2 *= c1; + h2 ^= k2; + + h2 = rotl(h2, 31); + h2 += h1; + h2 = h2 * 5 + 0x38495ab5; + } + + auto tail = data + nblocks * 16; + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch (len & 15) + { + case 15: + k2 ^= static_cast(tail[14]) << 48; + case 14: + k2 ^= static_cast(tail[13]) << 40; + case 13: + k2 ^= static_cast(tail[12]) << 32; + case 12: + k2 ^= static_cast(tail[11]) << 24; + case 11: + k2 ^= static_cast(tail[10]) << 16; + case 10: + k2 ^= static_cast(tail[9]) << 8; + case 9: + k2 ^= static_cast(tail[8]); + k2 *= c2; + k2 = rotl(k2, 33); + k2 *= c1; + h2 ^= k2; + + case 8: + k1 ^= static_cast(tail[7]) << 56; + case 7: + k1 ^= static_cast(tail[6]) << 48; + case 6: + k1 ^= static_cast(tail[5]) << 40; + case 5: + k1 ^= static_cast(tail[4]) << 32; + case 4: + k1 ^= static_cast(tail[3]) << 24; + case 3: + k1 ^= static_cast(tail[2]) << 16; + case 2: + k1 ^= static_cast(tail[1]) << 8; + case 1: + k1 ^= static_cast(tail[0]); + k1 *= c1; + k1 = rotl(k1, 31); + k1 *= c2; + h1 ^= k1; + } + + h1 ^= len; + h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + + h1 += h2; + // h2 += h1, unneeded since we only want 64-bits. + + return h1; + } +}; +} +} +#endif diff --git a/include/util/string_view.h b/include/util/string_view.h new file mode 100644 index 000000000..cf45a92ec --- /dev/null +++ b/include/util/string_view.h @@ -0,0 +1,617 @@ +/** + * @file string_view.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_UTIL_STRING_VIEW_H_ +#define META_UTIL_STRING_VIEW_H_ + +#include +#include +#include + +#include "util/hash.h" + +namespace meta +{ +namespace util +{ + +/** + * A non-owning reference to a string. I make no claims that this is + * completely standards-compliant---this is just a best-effort attempt at + * implementing what we need for MeTA. I have built this using its paper's + * wording for the Fundamentals TS. 
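+ *
+ * Note that a basic_string_view never owns the characters it refers to:
+ * the viewed buffer must outlive the view, and the view is not
+ * guaranteed to be null-terminated.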
+ * + * @see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n3921.html + */ +template > +class basic_string_view +{ + public: + using traits_type = Traits; + using value_type = Char; + using pointer = Char*; + using const_pointer = const Char*; + using reference = Char&; + using const_reference = const Char&; + using const_iterator = const_pointer; + using iterator = const_iterator; + using const_reverse_iterator = std::reverse_iterator; + using reverse_iterator = const_reverse_iterator; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + static constexpr size_type npos = size_type(-1); + + constexpr basic_string_view() noexcept : data_{nullptr}, size_{0} + { + // nothing + } + + constexpr basic_string_view(const basic_string_view&) noexcept = default; + basic_string_view& operator=(const basic_string_view&) noexcept = default; + + template + basic_string_view( + const std::basic_string& str) noexcept + : data_{str.data()}, + size_{str.size()} + { + // nothing + } + + constexpr basic_string_view(const Char* str) + : data_{str}, size_{Traits::length(str)} + { + // nothing + } + + constexpr basic_string_view(const Char* str, size_type len) + : data_{str}, size_{len} + { + // nothing + } + + constexpr const_iterator begin() const noexcept + { + return data_; + } + + constexpr const_iterator end() const noexcept + { + return data_ + size_; + } + + constexpr const_iterator cbegin() const noexcept + { + return begin(); + } + + constexpr const_iterator cend() const noexcept + { + return end(); + } + + const_reverse_iterator rbegin() const noexcept + { + return {end()}; + } + + const_reverse_iterator rend() const noexcept + { + return {begin()}; + } + + const_reverse_iterator crbegin() const noexcept + { + return rbegin(); + } + + const_reverse_iterator crend() const noexcept + { + return rend(); + } + + constexpr size_type size() const noexcept + { + return size_; + } + + constexpr size_type length() const noexcept + { + return size(); + } + + constexpr size_type max_size() const noexcept + { + return size(); + } + + constexpr bool empty() const noexcept + { + return size() == 0; + } + + constexpr const_reference operator[](size_type pos) const + { + return data_[pos]; + } + + constexpr const_reference at(size_type pos) const + { + if (pos >= size()) + throw std::out_of_range{"index out of bounds"}; + return data_[pos]; + } + + constexpr const_reference front() const + { + return data_[0]; + } + + constexpr const_reference back() const + { + return data_[size_ - 1]; + } + + constexpr const_pointer data() const noexcept + { + return data_; + } + + constexpr void clear() noexcept + { + data_ = nullptr; + size_ = 0; + } + + constexpr void remove_prefix(size_type n) + { + data_ += n; + size_ -= n; + } + + constexpr void remove_suffix(size_type n) + { + size_ -= n; + } + + constexpr void swap(basic_string_view& s) noexcept + { + using ::std::swap; + swap(data_, s.data_); + swap(size_, s.size_); + } + + template + explicit operator std::basic_string() const + { + return {begin(), end()}; + } + + template > + std::basic_string to_string(const Allocator& a + = Allocator{}) const + { + return {begin(), end(), a}; + } + + size_type copy(Char* s, size_type n, size_type pos = 0) const + { + if (pos > size()) + throw std::out_of_range{"index out of bounds"}; + + auto rlen = std::min(n, size() - pos); + std::copy_n(begin() + pos, rlen, s); + return rlen; + } + + constexpr basic_string_view substr(size_type pos = 0, + size_type n = npos) const + { + return pos > size() + ? 
throw std::out_of_range{"index out of bounds"} + : basic_string_view{data() + pos, std::min(n, size() - pos)}; + } + + constexpr int compare(basic_string_view s) const noexcept + { + constexpr auto rlen = std::min(size(), s.size()); + return Traits::compare(data(), s.data(), rlen); + } + + constexpr int compare(size_type pos1, size_type n1, + basic_string_view s) const + { + return substr(pos1, n1).compare(s); + } + + constexpr int compare(size_type pos1, size_type n1, basic_string_view s, + size_type pos2, size_type n2) const + { + return substr(pos1, n1).compare(s.substr(pos2, n2)); + } + + constexpr int compare(const Char* s) const + { + return compare(basic_string_view{s}); + } + + constexpr int compare(size_type pos1, size_type n1, const Char* s) const + { + return substr(pos1, n1).compare(basic_string_view{s}); + } + + constexpr int compare(size_type pos1, size_type n1, const Char* s, + size_type n2) const + { + return substr(pos1, n1).compare(basic_string_view{s, n2}); + } + + constexpr size_type find(basic_string_view s, size_type pos = 0) const + noexcept + { + if (pos >= size()) + return npos; + + auto it + = std::search(begin() + pos, end(), s.begin(), s.end(), Traits::eq); + if (it == end()) + return npos; + return std::distance(begin(), it); + } + + constexpr size_type find(Char c, size_type pos = 0) const noexcept + { + return find(basic_string_view{&c, 1}, pos); + } + + constexpr size_type find(const Char* s, size_type pos, size_type n) const + { + return find(basic_string_view{s, n}, pos); + } + + constexpr size_type find(const Char* s, size_type pos = 0) const + { + return find(basic_string_view{s}, pos); + } + + constexpr size_type rfind(basic_string_view s, size_type pos = npos) const + noexcept + { + if (size() < s.size()) + return npos; + + pos = std::min(pos, size()); + if (s.size() < size() - pos) + pos += s.size(); + else + pos = size(); + + auto it = std::find_end(begin(), begin() + pos, s.begin(), s.end(), + Traits::eq); + + if (it == begin() + pos) + return npos; + return std::distance(begin(), it); + } + + constexpr size_type rfind(Char c, size_type pos = npos) const noexcept + { + return rfind(basic_string_view{&c, 1}, pos); + } + + constexpr size_type rfind(const Char* s, size_type pos, size_type n) const + { + return rfind(basic_string_view{s, n}, pos); + } + + constexpr size_type rfind(const Char* s, size_type pos = npos) const + { + return rfind(basic_string_view{s}, pos); + } + + constexpr size_type find_first_of(basic_string_view s, + size_type pos = 0) const noexcept + { + if (pos >= size()) + return npos; + + auto it = std::find_first_of(begin() + pos, end(), s.begin(), s.end(), + Traits::eq); + if (it == end()) + return npos; + return std::distance(begin(), it); + } + + constexpr size_type find_first_of(Char c, size_type pos = 0) const noexcept + { + return find_first_of(basic_string_view{&c, 1}, pos); + } + + constexpr size_type find_first_of(const Char* s, size_type pos, + size_type n) const + { + return find_first_of(basic_string_view{s, n}, pos); + } + + constexpr size_type find_first_of(const Char* s, size_type pos = 0) const + { + return find_first_of(basic_string_view{s}, pos); + } + + constexpr size_type find_last_of(basic_string_view s, + size_type pos = npos) const noexcept + { + if (pos >= size()) + return npos; + + auto diff = size() - std::min(size(), pos); + auto it = std::find_first_of(rbegin() + diff, rend(), s.begin(), + s.end(), Traits::eq); + if (it == rend()) + return npos; + return size() - 1 - std::distance(rbegin(), it); + } + + 
constexpr size_type find_last_of(Char c, size_type pos = npos) const + noexcept + { + return find_last_of(basic_string_view{&c, 1}, pos); + } + + constexpr size_type find_last_of(const Char* s, size_type pos, + size_type n) const + { + return find_last_of(basic_string_view{s, n}, pos); + } + + constexpr size_type find_last_of(const Char* s, size_type pos = npos) const + { + return find_last_of(basic_string_view{s}, pos); + } + + constexpr size_type find_first_not_of(basic_string_view s, + size_type pos = 0) const noexcept + { + if (pos >= size()) + return npos; + + auto it = std::find_if(begin(), end(), [&](const_reference c) + { + return std::find(s.begin(), s.end(), c, + Traits::eq) == s.end(); + }); + if (it == end()) + return npos; + return std::distance(begin(), it); + } + + constexpr size_type find_first_not_of(Char c, size_type pos = 0) const + noexcept + { + return find_first_not_of(basic_string_view{&c, 1}, pos); + } + + constexpr size_type find_first_not_of(const Char* s, size_type pos, + size_type n) const + { + return find_first_not_of(basic_string_view{s, n}, pos); + } + + constexpr size_type find_first_not_of(const Char* s, + size_type pos = 0) const + { + return find_first_not_of(basic_string_view{s}, pos); + } + + constexpr size_type find_last_not_of(basic_string_view s, + size_type pos = npos) const noexcept + { + if (pos >= size()) + return npos; + + auto diff = size() - std::min(size(), pos); + auto it = std::find_if(rbegin() + diff, rend(), [&](const_reference c) + { + return std::find(s.begin(), s.end(), c, + Traits::eq) == s.end(); + }); + if (it == rend()) + return npos; + return size() - 1 - std::distance(rbegin(), it); + } + + constexpr size_type find_last_not_of(Char c, size_type pos = npos) const + noexcept + { + return find_last_not_of(basic_string_view{&c, 1}, pos); + } + + constexpr size_type find_last_not_of(const Char* s, size_type pos, + size_type n) const + { + return find_last_not_of(basic_string_view{s, n}, pos); + } + + constexpr size_type find_last_not_of(const Char* s, + size_type pos = npos) const + { + return find_last_not_of(basic_string_view{s}, pos); + } + + private: + const_pointer data_; + size_type size_; +}; + +using string_view = basic_string_view; +using u16string_view = basic_string_view; +using u32string_view = basic_string_view; +using wstring_view = basic_string_view; + +namespace +{ +template +using identity = typename std::decay::type; +} + +template +constexpr bool operator==(basic_string_view lhs, + basic_string_view rhs) noexcept +{ + return lhs.compare(rhs) == 0; +} + +template +constexpr bool + operator==(basic_string_view lhs, + identity> rhs) noexcept +{ + return lhs.compare(rhs) == 0; +} + +template +constexpr bool operator==(identity> lhs, + basic_string_view rhs) noexcept +{ + return lhs.compare(rhs) == 0; +} + +template +constexpr bool operator!=(basic_string_view lhs, + basic_string_view rhs) noexcept +{ + return lhs.compare(rhs) != 0; +} + +template +constexpr bool + operator!=(basic_string_view lhs, + identity> rhs) noexcept +{ + return lhs.compare(rhs) != 0; +} + +template +constexpr bool operator!=(identity> lhs, + basic_string_view rhs) noexcept +{ + return lhs.compare(rhs) != 0; +} + +template +constexpr bool operator<(basic_string_view lhs, + basic_string_view rhs) noexcept +{ + return lhs.compare(rhs) < 0; +} + +template +constexpr bool operator<(basic_string_view lhs, + identity> rhs) noexcept +{ + return lhs.compare(rhs) < 0; +} + +template +constexpr bool operator<(identity> lhs, + basic_string_view rhs) 
noexcept +{ + return lhs.compare(rhs) < 0; +} + +template +constexpr bool operator>(basic_string_view lhs, + basic_string_view rhs) noexcept +{ + return lhs.compare(rhs) > 0; +} + +template +constexpr bool operator>(basic_string_view lhs, + identity> rhs) noexcept +{ + return lhs.compare(rhs) > 0; +} + +template +constexpr bool operator>(identity> lhs, + basic_string_view rhs) noexcept +{ + return lhs.compare(rhs) > 0; +} + +template +constexpr bool operator<=(basic_string_view lhs, + basic_string_view rhs) noexcept +{ + return lhs.compare(rhs) <= 0; +} + +template +constexpr bool + operator<=(basic_string_view lhs, + identity> rhs) noexcept +{ + return lhs.compare(rhs) <= 0; +} + +template +constexpr bool operator<=(identity> lhs, + basic_string_view rhs) noexcept +{ + return lhs.compare(rhs) <= 0; +} + +template +constexpr bool operator>=(basic_string_view lhs, + basic_string_view rhs) noexcept +{ + return lhs.compare(rhs) >= 0; +} + +template +constexpr bool + operator>=(basic_string_view lhs, + identity> rhs) noexcept +{ + return lhs.compare(rhs) >= 0; +} + +template +constexpr bool operator>=(identity> lhs, + basic_string_view rhs) noexcept +{ + return lhs.compare(rhs) >= 0; +} + +template +std::basic_ostream& + operator<<(std::basic_ostream& os, + basic_string_view str) +{ + return os << str.to_string(); +} +} +} + +namespace std +{ + +template +struct hash> +{ + size_t operator()( + const meta::util::basic_string_view& view) const noexcept + { + static constexpr meta::util::murmur_hash<> hasher{}; + return hasher(view.data(), view.size()); + } +}; +} +#endif diff --git a/src/analyzers/filters/alpha_filter.cpp b/src/analyzers/filters/alpha_filter.cpp index 5dff8447e..8820b2929 100644 --- a/src/analyzers/filters/alpha_filter.cpp +++ b/src/analyzers/filters/alpha_filter.cpp @@ -28,9 +28,9 @@ alpha_filter::alpha_filter(const alpha_filter& other) // nothing } -void alpha_filter::set_content(const std::string& content) +void alpha_filter::set_content(std::string&& content) { - source_->set_content(content); + source_->set_content(std::move(content)); next_token(); } diff --git a/src/analyzers/filters/empty_sentence_filter.cpp b/src/analyzers/filters/empty_sentence_filter.cpp index 1d9a57471..e93ec6bd9 100644 --- a/src/analyzers/filters/empty_sentence_filter.cpp +++ b/src/analyzers/filters/empty_sentence_filter.cpp @@ -29,9 +29,9 @@ empty_sentence_filter::empty_sentence_filter(const empty_sentence_filter& other) // nothing } -void empty_sentence_filter::set_content(const std::string& content) +void empty_sentence_filter::set_content(std::string&& content) { - source_->set_content(content); + source_->set_content(std::move(content)); first_ = second_ = util::nullopt; next_token(); } diff --git a/src/analyzers/filters/english_normalizer.cpp b/src/analyzers/filters/english_normalizer.cpp index 5bf837773..75936b098 100644 --- a/src/analyzers/filters/english_normalizer.cpp +++ b/src/analyzers/filters/english_normalizer.cpp @@ -28,10 +28,10 @@ english_normalizer::english_normalizer(const english_normalizer& other) // nothing } -void english_normalizer::set_content(const std::string& content) +void english_normalizer::set_content(std::string&& content) { tokens_.clear(); - source_->set_content(content); + source_->set_content(std::move(content)); } std::string english_normalizer::next() diff --git a/src/analyzers/filters/icu_filter.cpp b/src/analyzers/filters/icu_filter.cpp index 099dbed11..d51e5abe4 100644 --- a/src/analyzers/filters/icu_filter.cpp +++ b/src/analyzers/filters/icu_filter.cpp @@ 
-30,9 +30,9 @@ icu_filter::icu_filter(const icu_filter& other) // nothing } -void icu_filter::set_content(const std::string& content) +void icu_filter::set_content(std::string&& content) { - source_->set_content(content); + source_->set_content(std::move(content)); next_token(); } diff --git a/src/analyzers/filters/length_filter.cpp b/src/analyzers/filters/length_filter.cpp index 4a23f4456..7016677ed 100644 --- a/src/analyzers/filters/length_filter.cpp +++ b/src/analyzers/filters/length_filter.cpp @@ -35,10 +35,10 @@ length_filter::length_filter(const length_filter& other) // nothing } -void length_filter::set_content(const std::string& content) +void length_filter::set_content(std::string&& content) { token_ = util::nullopt; - source_->set_content(content); + source_->set_content(std::move(content)); next_token(); } diff --git a/src/analyzers/filters/list_filter.cpp b/src/analyzers/filters/list_filter.cpp index c24e3d363..878c873a8 100644 --- a/src/analyzers/filters/list_filter.cpp +++ b/src/analyzers/filters/list_filter.cpp @@ -40,10 +40,10 @@ list_filter::list_filter(const list_filter& other) // nothing } -void list_filter::set_content(const std::string& content) +void list_filter::set_content(std::string&& content) { token_ = util::nullopt; - source_->set_content(content); + source_->set_content(std::move(content)); next_token(); } diff --git a/src/analyzers/filters/lowercase_filter.cpp b/src/analyzers/filters/lowercase_filter.cpp index e36acd46d..28354ad8a 100644 --- a/src/analyzers/filters/lowercase_filter.cpp +++ b/src/analyzers/filters/lowercase_filter.cpp @@ -29,9 +29,9 @@ lowercase_filter::lowercase_filter(const lowercase_filter& other) // nothing } -void lowercase_filter::set_content(const std::string& content) +void lowercase_filter::set_content(std::string&& content) { - source_->set_content(content); + source_->set_content(std::move(content)); } std::string lowercase_filter::next() diff --git a/src/analyzers/filters/porter2_stemmer.cpp b/src/analyzers/filters/porter2_stemmer.cpp index 9e8c79271..d1cbdb2ba 100644 --- a/src/analyzers/filters/porter2_stemmer.cpp +++ b/src/analyzers/filters/porter2_stemmer.cpp @@ -27,9 +27,9 @@ porter2_stemmer::porter2_stemmer(const porter2_stemmer& other) // nothing } -void porter2_stemmer::set_content(const std::string& content) +void porter2_stemmer::set_content(std::string&& content) { - source_->set_content(content); + source_->set_content(std::move(content)); next_token(); } diff --git a/src/analyzers/filters/ptb_normalizer.cpp b/src/analyzers/filters/ptb_normalizer.cpp index a6d5a42f7..3e79c46b4 100644 --- a/src/analyzers/filters/ptb_normalizer.cpp +++ b/src/analyzers/filters/ptb_normalizer.cpp @@ -28,10 +28,10 @@ ptb_normalizer::ptb_normalizer(const ptb_normalizer& other) // nothing } -void ptb_normalizer::set_content(const std::string& content) +void ptb_normalizer::set_content(std::string&& content) { tokens_.clear(); - source_->set_content(content); + source_->set_content(std::move(content)); } std::string ptb_normalizer::next() diff --git a/src/analyzers/filters/sentence_boundary.cpp b/src/analyzers/filters/sentence_boundary.cpp index fbaddc79c..894cb37be 100644 --- a/src/analyzers/filters/sentence_boundary.cpp +++ b/src/analyzers/filters/sentence_boundary.cpp @@ -40,12 +40,12 @@ sentence_boundary::sentence_boundary(const sentence_boundary& other) // nothing } -void sentence_boundary::set_content(const std::string& content) +void sentence_boundary::set_content(std::string&& content) { tokens_.clear(); tokens_.emplace_back(""); prev_ = 
util::nullopt; - source_->set_content(content); + source_->set_content(std::move(content)); } void sentence_boundary::load_heuristics(const cpptoml::table& config) diff --git a/src/analyzers/tokenizers/character_tokenizer.cpp b/src/analyzers/tokenizers/character_tokenizer.cpp index 41b7b2455..aed68269d 100644 --- a/src/analyzers/tokenizers/character_tokenizer.cpp +++ b/src/analyzers/tokenizers/character_tokenizer.cpp @@ -21,10 +21,10 @@ character_tokenizer::character_tokenizer() : idx_{0} // nothing } -void character_tokenizer::set_content(const std::string& content) +void character_tokenizer::set_content(std::string&& content) { idx_ = 0; - content_ = content; + content_ = std::move(content); } std::string character_tokenizer::next() diff --git a/src/analyzers/tokenizers/icu_tokenizer.cpp b/src/analyzers/tokenizers/icu_tokenizer.cpp index b0c392743..d354822cb 100644 --- a/src/analyzers/tokenizers/icu_tokenizer.cpp +++ b/src/analyzers/tokenizers/icu_tokenizer.cpp @@ -69,11 +69,11 @@ class icu_tokenizer::impl // check first character, if it's whitespace skip it UChar32 codepoint; - U8_GET_UNSAFE(wrd.c_str(), 0, codepoint); + U8_GET_UNSAFE(wrd.data(), 0, codepoint); if (u_isUWhiteSpace(codepoint)) continue; - tokens_.emplace_back(std::move(wrd)); + tokens_.emplace_back(wrd.to_string()); } if (!suppress_tags_) tokens_.emplace_back(""); @@ -131,9 +131,9 @@ icu_tokenizer::icu_tokenizer(const icu_tokenizer& other) : impl_{*other.impl_} icu_tokenizer::~icu_tokenizer() = default; -void icu_tokenizer::set_content(const std::string& content) +void icu_tokenizer::set_content(std::string&& content) { - impl_->set_content(content); + impl_->set_content(std::move(content)); } std::string icu_tokenizer::next() diff --git a/src/analyzers/tokenizers/whitespace_tokenizer.cpp b/src/analyzers/tokenizers/whitespace_tokenizer.cpp index 2b13fed76..b7887f722 100644 --- a/src/analyzers/tokenizers/whitespace_tokenizer.cpp +++ b/src/analyzers/tokenizers/whitespace_tokenizer.cpp @@ -23,9 +23,9 @@ whitespace_tokenizer::whitespace_tokenizer() : idx_{0} { } -void whitespace_tokenizer::set_content(const std::string& content) +void whitespace_tokenizer::set_content(std::string&& content) { - content_ = content; + content_ = std::move(content); idx_ = 0; } diff --git a/src/analyzers/tools/tokenize_test.cpp b/src/analyzers/tools/tokenize_test.cpp index db9c6e3b3..975fd7887 100644 --- a/src/analyzers/tools/tokenize_test.cpp +++ b/src/analyzers/tools/tokenize_test.cpp @@ -56,7 +56,7 @@ int main(int argc, char** argv) if (line.empty()) break; - stream->set_content(line); + stream->set_content(std::move(line)); while (*stream) { std::cout << stream->next(); diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index 236cf10d2..8d0d5cf97 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -66,7 +66,7 @@ void language_model::learn_model(const std::string& config_file) while (corpus->has_next()) { auto doc = corpus->next(); - stream->set_content(doc.content()); + stream->set_content(analyzer::get_content(doc)); // get ngram stream started std::deque ngram; diff --git a/src/sequence/crf/tools/pos_tag.cpp b/src/sequence/crf/tools/pos_tag.cpp index a28edeeee..945ffd27d 100644 --- a/src/sequence/crf/tools/pos_tag.cpp +++ b/src/sequence/crf/tools/pos_tag.cpp @@ -59,7 +59,7 @@ int main(int argc, char* argv[]) std::unique_ptr stream = make_unique(); - stream->set_content(line); + stream->set_content(std::move(line)); sequence::sequence seq; while (*stream) { diff --git a/src/utf/segmenter.cpp 
b/src/utf/segmenter.cpp
index 98115d860..e93283ccc 100644
--- a/src/utf/segmenter.cpp
+++ b/src/utf/segmenter.cpp
@@ -74,7 +74,7 @@ class segmenter::impl
      * Sets the content of the segmenter.
      * @param str The content to be set
      */
-    void set_content(const std::string& str)
+    void set_content(util::string_view str)
     {
         text_ = str;
     }
@@ -84,7 +84,7 @@ class segmenter::impl
      * @param end The ending index
      * @return the substring between begin and end
      */
-    std::string substr(int32_t begin, int32_t end) const
+    util::string_view substr(int32_t begin, int32_t end) const
     {
         return text_.substr(begin, end - begin);
     }
@@ -140,7 +140,7 @@ class segmenter::impl
         auto status = U_ZERO_ERROR;
         UText utxt = UTEXT_INITIALIZER;
-        utext_openUTF8(&utxt, text_.c_str() + first, last - first, &status);
+        utext_openUTF8(&utxt, text_.data() + first, last - first, &status);
         if (!U_SUCCESS(status))
         {
             std::string err = "Failed to open UText: ";
@@ -170,8 +170,8 @@ class segmenter::impl
     }

   private:
-    /// The utf8 string we are segmenting
-    std::string text_;
+    /// A view over the utf8 string we are segmenting
+    util::string_view text_;
     /// A pointer to a sentence break iterator
     std::unique_ptr<icu::BreakIterator> sentence_iter_;
     /// A pointer to a word break iterator
@@ -197,7 +197,7 @@ segmenter::segmenter(const segmenter& other) : impl_{*other.impl_}
 segmenter::~segmenter() = default;

-void segmenter::set_content(const std::string& str)
+void segmenter::set_content(util::string_view str)
 {
     impl_->set_content(str);
 }
@@ -217,7 +217,7 @@ auto segmenter::words(const segment& seg) const -> std::vector<segment>
     return impl_->segments(seg.begin_, seg.end_, impl::segment_t::WORDS);
 }

-std::string segmenter::content(const segment& seg) const
+util::string_view segmenter::content(const segment& seg) const
 {
     return impl_->substr(seg.begin_, seg.end_);
 }

From 519bee613d9642a8a8e472f39841712004a1d852 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sun, 14 Jun 2015 23:25:55 -0500
Subject: [PATCH 137/481] Move tokens out when asking icu_tokenizer for the
 next().

---
 src/analyzers/tokenizers/icu_tokenizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/analyzers/tokenizers/icu_tokenizer.cpp b/src/analyzers/tokenizers/icu_tokenizer.cpp
index d354822cb..5c6a580c2 100644
--- a/src/analyzers/tokenizers/icu_tokenizer.cpp
+++ b/src/analyzers/tokenizers/icu_tokenizer.cpp
@@ -87,7 +87,7 @@ class icu_tokenizer::impl
     {
         if (!*this)
             throw token_stream_exception{"next() called with no tokens left"};
-        auto result = tokens_.front();
+        auto result = std::move(tokens_.front());
         tokens_.pop_front();
         return result;
     }

From 9f0843d33936d6d68deb51b966cd2490af0bd4ee Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Mon, 15 Jun 2015 01:35:13 -0500
Subject: [PATCH 138/481] Eliminate string duplication by using move-semantics
 in filters.

In a lot of places, we were assigning a token into an optional or
similar buffer using regular string assignment instead of forcing move
assignment. This should eliminate a lot of unnecessary copies.
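The pattern, in a minimal sketch (hypothetical names, not code from this
patch): move-assigning the buffered token transfers its heap buffer
instead of duplicating it.

```cpp
#include <string>
#include <utility>

struct buffering_filter // hypothetical filter holding one lookahead token
{
    std::string token_;

    void store_copying(std::string tok)
    {
        token_ = tok; // copy assignment: duplicates tok's buffer
    }

    void store_moving(std::string tok)
    {
        token_ = std::move(tok); // move assignment: steals tok's buffer
    }
};

int main()
{
    buffering_filter f;
    f.store_copying("a token long enough to live on the heap");
    f.store_moving("a token long enough to live on the heap");
}
```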
--- src/analyzers/filters/alpha_filter.cpp | 6 +++--- src/analyzers/filters/length_filter.cpp | 6 +++--- src/analyzers/filters/list_filter.cpp | 6 +++--- src/analyzers/filters/lowercase_filter.cpp | 3 +-- src/analyzers/filters/porter2_stemmer.cpp | 2 +- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/analyzers/filters/alpha_filter.cpp b/src/analyzers/filters/alpha_filter.cpp index 8820b2929..d0f33e3cf 100644 --- a/src/analyzers/filters/alpha_filter.cpp +++ b/src/analyzers/filters/alpha_filter.cpp @@ -36,7 +36,7 @@ void alpha_filter::set_content(std::string&& content) std::string alpha_filter::next() { - auto tok = *token_; + auto tok = std::move(*token_); next_token(); return tok; } @@ -48,7 +48,7 @@ void alpha_filter::next_token() auto tok = source_->next(); if (tok == "" || tok == "") { - token_ = tok; + token_ = std::move(tok); return; } @@ -56,7 +56,7 @@ void alpha_filter::next_token() { return !utf::isalpha(codepoint) && codepoint != '\''; }); if (!filt.empty()) { - token_ = filt; + token_ = std::move(filt); return; } } diff --git a/src/analyzers/filters/length_filter.cpp b/src/analyzers/filters/length_filter.cpp index 7016677ed..40944180f 100644 --- a/src/analyzers/filters/length_filter.cpp +++ b/src/analyzers/filters/length_filter.cpp @@ -44,7 +44,7 @@ void length_filter::set_content(std::string&& content) std::string length_filter::next() { - auto tok = *token_; + auto tok = std::move(*token_); next_token(); return tok; } @@ -67,13 +67,13 @@ void length_filter::next_token() auto tok = source_->next(); if (tok == "" || tok == "") { - token_ = tok; + token_ = std::move(tok); return; } auto len = utf::length(tok); if (len >= min_length_ && len <= max_length_) { - token_ = tok; + token_ = std::move(tok); return; } } diff --git a/src/analyzers/filters/list_filter.cpp b/src/analyzers/filters/list_filter.cpp index 878c873a8..47bfe9b95 100644 --- a/src/analyzers/filters/list_filter.cpp +++ b/src/analyzers/filters/list_filter.cpp @@ -49,7 +49,7 @@ void list_filter::set_content(std::string&& content) std::string list_filter::next() { - auto tok = *token_; + auto tok = std::move(*token_); next_token(); return tok; } @@ -76,14 +76,14 @@ void list_filter::next_token() case type::ACCEPT: if (found) { - token_ = tok; + token_ = std::move(tok); return; } break; case type::REJECT: if (!found) { - token_ = tok; + token_ = std::move(tok); return; } break; diff --git a/src/analyzers/filters/lowercase_filter.cpp b/src/analyzers/filters/lowercase_filter.cpp index 28354ad8a..441371dec 100644 --- a/src/analyzers/filters/lowercase_filter.cpp +++ b/src/analyzers/filters/lowercase_filter.cpp @@ -36,8 +36,7 @@ void lowercase_filter::set_content(std::string&& content) std::string lowercase_filter::next() { - auto tok = source_->next(); - return utf::foldcase(tok); + return utf::foldcase(source_->next()); } lowercase_filter::operator bool() const diff --git a/src/analyzers/filters/porter2_stemmer.cpp b/src/analyzers/filters/porter2_stemmer.cpp index d1cbdb2ba..cf9a28870 100644 --- a/src/analyzers/filters/porter2_stemmer.cpp +++ b/src/analyzers/filters/porter2_stemmer.cpp @@ -48,7 +48,7 @@ void porter2_stemmer::next_token() Porter2Stemmer::stem(tok); if (!tok.empty()) { - token_ = tok; + token_ = std::move(tok); return; } } From bee4ff3408df83765923e63f7ea7d97d73fdebd5 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Mon, 15 Jun 2015 01:36:23 -0500 Subject: [PATCH 139/481] Refactor ngram_word_analyzer to not buffer all tokens. We only need to buffer N tokens. 
This reduces the amount of string copying further. --- src/analyzers/ngram/ngram_word_analyzer.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/analyzers/ngram/ngram_word_analyzer.cpp b/src/analyzers/ngram/ngram_word_analyzer.cpp index 229810a8e..6550c124e 100644 --- a/src/analyzers/ngram/ngram_word_analyzer.cpp +++ b/src/analyzers/ngram/ngram_word_analyzer.cpp @@ -33,20 +33,20 @@ ngram_word_analyzer::ngram_word_analyzer(const ngram_word_analyzer& other) void ngram_word_analyzer::tokenize(corpus::document& doc) { - // first, get tokens stream_->set_content(get_content(doc)); - std::vector tokens; + std::deque tokens; while (*stream_) - tokens.push_back(stream_->next()); - - // second, create ngrams from them - for (size_t i = n_value() - 1; i < tokens.size(); ++i) { - std::string combined = tokens[i]; - for (size_t j = 1; j < n_value(); ++j) - combined = tokens[i - j] + "_" + combined; + tokens.emplace_back(stream_->next()); + if (tokens.size() == n_value()) + { + auto combined = std::move(tokens.front()); + tokens.pop_front(); + for (const auto& token : tokens) + combined += "_" + token; - doc.increment(combined, 1); + doc.increment(combined, 1); + } } } From 936a0b862d417a6941720d84a680235e7f9ca7af Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 15 Jun 2015 19:11:40 -0500 Subject: [PATCH 140/481] create sr_parser interactive demo --- src/parser/tools/CMakeLists.txt | 3 ++ src/parser/tools/sr_parse.cpp | 87 +++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 src/parser/tools/sr_parse.cpp diff --git a/src/parser/tools/CMakeLists.txt b/src/parser/tools/CMakeLists.txt index 322c434cf..3593f8a64 100644 --- a/src/parser/tools/CMakeLists.txt +++ b/src/parser/tools/CMakeLists.txt @@ -1,6 +1,9 @@ add_executable(read-trees read_trees.cpp) target_link_libraries(read-trees meta-parser) +add_executable(sr_parse sr_parse.cpp) +target_link_libraries(sr_parse meta-parser meta-analyzers meta-greedy-tagger) + add_executable(parser-train parser_train.cpp) target_link_libraries(parser-train meta-parser meta-util) diff --git a/src/parser/tools/sr_parse.cpp b/src/parser/tools/sr_parse.cpp new file mode 100644 index 000000000..f02937a62 --- /dev/null +++ b/src/parser/tools/sr_parse.cpp @@ -0,0 +1,87 @@ +/** + * @file sr_parse.cpp + * @author Sean Massung + */ + +#include "analyzers/filters/all.h" +#include "analyzers/tokenizers/icu_tokenizer.h" +#include "parser/sr_parser.h" +#include "sequence/perceptron.h" +#include "sequence/io/ptb_parser.h" +#include "sequence/sequence.h" +#include "util/shim.h" +#include "cpptoml.h" + +using namespace meta; + +int main(int argc, char* argv[]) +{ + if (argc != 2) + { + std::cerr << "Usage:\t" << argv[0] << " configFile" << std::endl; + return 1; + } + + logging::set_cerr_logging(); + + auto config = cpptoml::parse_file(argv[1]); + auto seq_grp = config.get_table("sequence"); + if (!seq_grp) + throw std::runtime_error{"[sequence] group needed in config file"}; + + auto prefix = seq_grp->get_as("prefix"); + if (!prefix) + throw std::runtime_error{"[sequence] group needs a prefix key"}; + + auto parser_grp = config.get_table("parser"); + if (!parser_grp) + throw std::runtime_error{"[parser] group needed in config file"}; + + auto parser_prefix = parser_grp->get_as("prefix"); + if (!parser_prefix) + throw std::runtime_error{"[parser] group needs a prefix key"}; + + std::cout << "Loading tagging model" << std::endl; + sequence::perceptron tagger{*prefix}; + + std::cout << "Loading parser 
model" << std::endl; + parser::sr_parser parser{*parser_prefix}; + + std::unique_ptr stream + = make_unique(); + stream = make_unique(std::move(stream)); + + std::string line; + std::cout << "Type a sentence to have it parsed, blank to exit." + << std::endl; + while (true) + { + std::cout << " > "; + std::getline(std::cin, line); + + if (line.empty()) + break; + + sequence::sequence seq; + stream->set_content(std::move(line)); + while (*stream) + { + auto token = stream->next(); + if (token == "") + { + seq = {}; + } + else if (token == "") + { + tagger.tag(seq); + parser.parse(seq).pretty_print(std::cout); + } + else + { + seq.add_symbol(sequence::symbol_t{token}); + } + } + + std::cout << std::endl; + } +} From c6c577d6401c06740d676ff6a10dd771de65aa98 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 15 Jun 2015 20:03:18 -0500 Subject: [PATCH 141/481] fix code marker type in class comments --- include/analyzers/filters/icu_filter.h | 4 ++-- include/analyzers/filters/length_filter.h | 4 ++-- include/analyzers/filters/list_filter.h | 8 ++++---- include/analyzers/filters/sentence_boundary.h | 4 ++-- include/analyzers/ngram/ngram_word_analyzer.h | 4 ++-- include/parser/analyzers/tree_analyzer.h | 3 ++- include/sequence/analyzers/ngram_pos_analyzer.h | 4 ++-- 7 files changed, 16 insertions(+), 15 deletions(-) diff --git a/include/analyzers/filters/icu_filter.h b/include/analyzers/filters/icu_filter.h index 875ce20ca..03e0a5c53 100644 --- a/include/analyzers/filters/icu_filter.h +++ b/include/analyzers/filters/icu_filter.h @@ -31,9 +31,9 @@ namespace filters * sequence. * * Required config parameters: - * ```toml + * ~~~toml * id = "transformer" - * ``` + * ~~~ * * Optional config parameters: none. * diff --git a/include/analyzers/filters/length_filter.h b/include/analyzers/filters/length_filter.h index 4ac9d9b8d..4feed011b 100644 --- a/include/analyzers/filters/length_filter.h +++ b/include/analyzers/filters/length_filter.h @@ -32,10 +32,10 @@ namespace filters * inclusive. * * Required config parameters: - * ```toml + * ~~~toml * min = 2 # any integer * max = 32 # any integer >= min - * ``` + * ~~~ * * Optional config parameters: none. */ diff --git a/include/analyzers/filters/list_filter.h b/include/analyzers/filters/list_filter.h index f94a5a770..d03579f54 100644 --- a/include/analyzers/filters/list_filter.h +++ b/include/analyzers/filters/list_filter.h @@ -32,14 +32,14 @@ namespace filters * Filter that either removes or keeps tokens from a given list. * * Required config parameters: - * ```toml + * ~~~toml * file = "path" - * ``` + * ~~~ * Optional config parameters: - * ```toml + * ~~~toml * type = "accept" # or, * type = "reject" # default - * ``` + * ~~~ */ class list_filter : public util::clonable { diff --git a/include/analyzers/filters/sentence_boundary.h b/include/analyzers/filters/sentence_boundary.h index f5969d10e..1f2aa9a6c 100644 --- a/include/analyzers/filters/sentence_boundary.h +++ b/include/analyzers/filters/sentence_boundary.h @@ -35,11 +35,11 @@ namespace filters * in the source stream. * * Required config parameters: - * ```toml + * ~~~toml * punctuation = "path" * start-exceptions = "path" * end-exceptions = "path" - * ``` + * ~~~ * * Optional config parameters: none. 
*/ diff --git a/include/analyzers/ngram/ngram_word_analyzer.h b/include/analyzers/ngram/ngram_word_analyzer.h index 5280f4082..6e5542ecd 100644 --- a/include/analyzers/ngram/ngram_word_analyzer.h +++ b/include/analyzers/ngram/ngram_word_analyzer.h @@ -22,12 +22,12 @@ namespace analyzers * Analyzes documents using their tokenized words. * * Required config parameters: - * ```toml + * ~~~toml * [[analyzers]] * method = "ngram-word" # this analyzer * ngram = 1 # integer required * filter = "default-chain" # filter type required - * ``` + * ~~~ * * Optional config parameters: none. * diff --git a/include/parser/analyzers/tree_analyzer.h b/include/parser/analyzers/tree_analyzer.h index df10ed7f7..41614a425 100644 --- a/include/parser/analyzers/tree_analyzer.h +++ b/include/parser/analyzers/tree_analyzer.h @@ -27,13 +27,14 @@ namespace analyzers * Base class tokenizing using parse tree features. * * Required config parameters: - * ```toml + * ~~~toml * [[analyzers]] * method = "tree" # this analyzer * filter = [{type = "icu-tokenizer"}, {type = "ptb-normalizer"}] # example * features = ["skel", "subtree"] # example * tagger = "path" * parser = "path" + * ~~~ * * Optional config parameters: none. * diff --git a/include/sequence/analyzers/ngram_pos_analyzer.h b/include/sequence/analyzers/ngram_pos_analyzer.h index d9d27b674..2547845cb 100644 --- a/include/sequence/analyzers/ngram_pos_analyzer.h +++ b/include/sequence/analyzers/ngram_pos_analyzer.h @@ -29,14 +29,14 @@ namespace analyzers * should *not* be removed and words should not be stemmed for the same reason. * * Required config parameters: - * ```toml + * ~~~toml * [[analyzers]] * method = "ngram-pos" # this analyzer * ngram = 1 # integer required * crf-prefix = "path" * [[analyzers.filter]] * type = "icu-tokenizer" # recommended - * ``` + * ~~~ * * Optional config parameters: none. * From 3f5c34a0df6f4257a3a4affe848ddf6d3c1b1648 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 15 Jun 2015 20:11:53 -0500 Subject: [PATCH 142/481] update default config file with corpus .toml file key --- config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.toml b/config.toml index b052ba5ad..488c2f4b0 100644 --- a/config.toml +++ b/config.toml @@ -8,8 +8,8 @@ end-exceptions = "../data/sentence-boundaries/sentence-end-exceptions.txt" query-judgements = "../data/ceeaus-qrels.txt" query-path = "../queries.txt" # create this file -corpus-type = "line-corpus" dataset = "20newsgroups" +corpus = "line.toml" # located inside dataset folder forward-index = "20news-fwd" inverted-index = "20news-inv" From 7fc5d34b5976152331e6c8b42917b596b1021c9a Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 15 Jun 2015 20:13:07 -0500 Subject: [PATCH 143/481] document corpus config options re: #91 --- include/corpus/corpus.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/corpus/corpus.h b/include/corpus/corpus.h index 6ebf6160b..e2bebb739 100644 --- a/include/corpus/corpus.h +++ b/include/corpus/corpus.h @@ -25,6 +25,20 @@ namespace corpus /** * Provides interface to with multiple corpus input formats. + * + * Required config parameters: + * ~~~toml + * prefix = "prefix" + * dataset = "datasetname" # relative to prefix + * corpus = "corpus-spec-file" # e.g. "line.toml" + * ~~~ + * + * The corpus spec toml file also requires a corpus type and an optional + * encoding for the corpus text. + * + * Optional config parameters: none. 
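+ * As an illustration only (the exact key names are assumed here, not
+ * prescribed by this header), a minimal line.toml spec might contain:
+ * ~~~toml
+ * type = "line-corpus"
+ * encoding = "utf-8" # optional
+ * ~~~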
+ * + * @see https://meta-toolkit.org/overview-tutorial.html */ class corpus { From f6ac7cbbcd5ea30f368e497a5ee8b90d0efb7790 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 15 Jun 2015 20:21:53 -0500 Subject: [PATCH 144/481] document ranker config options re: #91 --- include/index/ranker/absolute_discount.h | 11 +++++++++++ include/index/ranker/dirichlet_prior.h | 12 ++++++++++++ include/index/ranker/jelinek_mercer.h | 11 +++++++++++ include/index/ranker/okapi_bm25.h | 13 +++++++++++++ include/index/ranker/pivoted_length.h | 11 +++++++++++ 5 files changed, 58 insertions(+) diff --git a/include/index/ranker/absolute_discount.h b/include/index/ranker/absolute_discount.h index 007020e5a..fb768c78e 100644 --- a/include/index/ranker/absolute_discount.h +++ b/include/index/ranker/absolute_discount.h @@ -20,6 +20,17 @@ namespace index /** * Implements the absolute discounting smoothing method. + * + * Required config parameters: + * ~~~toml + * [ranker] + * method = "absolute-discount" + * ~~~ + * + * Optional config parameters: + * ~~~toml + * delta = 0.7 + * ~~~ */ class absolute_discount : public language_model_ranker { diff --git a/include/index/ranker/dirichlet_prior.h b/include/index/ranker/dirichlet_prior.h index c08291ad3..4058435ae 100644 --- a/include/index/ranker/dirichlet_prior.h +++ b/include/index/ranker/dirichlet_prior.h @@ -19,6 +19,18 @@ namespace index /** * Implements Bayesian smoothing with a Dirichlet prior. + * + * Required config parameters: + * ~~~toml + * [ranker] + * method = "dirichlet-prior" + * ~~~ + * + * Optional config parameters: + * ~~~toml + * mu = 2000.0 + * ~~~ + */ class dirichlet_prior : public language_model_ranker { diff --git a/include/index/ranker/jelinek_mercer.h b/include/index/ranker/jelinek_mercer.h index c0339e9a4..a0203f51d 100644 --- a/include/index/ranker/jelinek_mercer.h +++ b/include/index/ranker/jelinek_mercer.h @@ -22,6 +22,17 @@ namespace index * can be viewed as a linear interpolation between the query term probablity * and the collection term probability. The model parameter lambda is the * weighting of this interpolation. + * + * Required config parameters: + * ~~~toml + * [ranker] + * method = "jelinek-mercer" + * ~~~ + * + * Optional config parameters: + * ~~~toml + * lambda = 0.7 + * ~~~ */ class jelinek_mercer : public language_model_ranker { diff --git a/include/index/ranker/okapi_bm25.h b/include/index/ranker/okapi_bm25.h index b1542931d..93b371ef6 100644 --- a/include/index/ranker/okapi_bm25.h +++ b/include/index/ranker/okapi_bm25.h @@ -19,6 +19,19 @@ namespace index /** * The Okapi BM25 scoring function. + * + * Required config parameters: + * ~~~toml + * [ranker] + * method = "bm25" + * ~~~ + * + * Optional config parameters: + * ~~~toml + * k1 = 1.2 + * b = 0.75 + * k3 = 500.0 + * ~~~ */ class okapi_bm25 : public ranker { diff --git a/include/index/ranker/pivoted_length.h b/include/index/ranker/pivoted_length.h index 9e1930020..67a835d89 100644 --- a/include/index/ranker/pivoted_length.h +++ b/include/index/ranker/pivoted_length.h @@ -21,6 +21,17 @@ namespace index * The pivoted document length normalization ranking function * @see Amit Singal, Chris Buckley, and Mandar Mitra. Pivoted document length * normalization. SIGIR '96, pages 21-29. 
+ * + * Required config parameters: + * ~~~toml + * [ranker] + * method = "pivoted-length" + * ~~~ + * + * Optional config parameters: + * ~~~toml + * s = 0.2 + * ~~~ */ class pivoted_length : public ranker { From 97b06c3e531cd61c91764bc85760883ca567d11d Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 15 Jun 2015 20:30:04 -0500 Subject: [PATCH 145/481] document LDA implementations re: #91 --- include/topics/lda_model.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/topics/lda_model.h b/include/topics/lda_model.h index cb9a2b5dc..938f2c586 100644 --- a/include/topics/lda_model.h +++ b/include/topics/lda_model.h @@ -21,6 +21,18 @@ namespace topics /** * An LDA topic model base class. + * + * Required config parameters (for use with the ./lda executable): + * ~~~toml + * inference = "inference-method" # gibbs, pargibbs, cvb, scvb + * max-iters = 1000 + * alpha = 1.0 + * beta = 1.0 + * topics = 4 + * model-prefix = "prefix" + * ~~~ + * + * Optional config parameters: none. */ class lda_model { From 07df86e6742516f88ac39c3a3c457809e62dfe2f Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 16 Jun 2015 13:24:10 -0500 Subject: [PATCH 146/481] add include for travis's version of g++ --- src/lm/sentence.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp index 1c3a0354a..71cee7a95 100644 --- a/src/lm/sentence.cpp +++ b/src/lm/sentence.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "lm/sentence.h" #include "analyzers/analyzer.h" #include "analyzers/tokenizers/icu_tokenizer.h" From 74db84ee512a83323a08f9fde930c45782ba9174 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 16 Jun 2015 13:33:56 -0500 Subject: [PATCH 147/481] fix correct config path in lm unit test --- src/test/lm_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/lm_test.cpp b/src/test/lm_test.cpp index 9b0768348..48ac16e83 100644 --- a/src/test/lm_test.cpp +++ b/src/test/lm_test.cpp @@ -21,7 +21,7 @@ int lm_tests() num_failed += testing::run_test( "lm-test", [&]() { - lm::language_model model{cpptoml::parse_file("config.toml")}; + lm::language_model model{cpptoml::parse_file("test-config.toml")}; lm::sentence s1{ " I disagree with this statement for several reasons . ", false}; From 11f039f244c0eaccd39c8a6cf2f645fedeea3290 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 16 Jun 2015 13:39:03 -0500 Subject: [PATCH 148/481] remove generic lambdas from feature selection code --- src/features/feature_selector.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/features/feature_selector.cpp b/src/features/feature_selector.cpp index 19ef16727..fe21ea95b 100644 --- a/src/features/feature_selector.cpp +++ b/src/features/feature_selector.cpp @@ -58,9 +58,9 @@ void feature_selector::score_all() } prog.end(); - parallel::parallel_for(scores.begin(), scores.end(), [&](auto& v) + parallel::parallel_for(scores.begin(), scores.end(), [&](std::vector& v) { - std::sort(v.begin(), v.end(), [&](const auto& a, const auto& b) + std::sort(v.begin(), v.end(), [&](const pair_t& a, const pair_t& b) { return a.second > b.second; }); From a7e21013956236f8d127a45003aaa246245c3bfb Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Thu, 18 Jun 2015 19:01:31 -0500 Subject: [PATCH 149/481] fix issue #101 If using zlib, either compressed or uncompressed model files can be loaded. They may even be mixed and matched (some compressed, some not) if that happens to be the case. 
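A sketch of the loading pattern (load_model and the model file names are hypothetical; io::gzifstream and filesystem::file_exists are the same utilities the diffs below use): probe for the compressed file first, fall back to the plain one, and funnel both into a single loader written against std::istream, which is what makes the mixing possible.

~~~cpp
#include <fstream>
#include <istream>
#include <string>

#include "util/filesystem.h"
#ifdef META_HAS_ZLIB
#include "io/gzstream.h"
#endif

// Hypothetical free function; "model"/"model.gz" are placeholder names.
// load_from is any callable taking a std::istream&, which is what lets
// compressed and plain model files be mixed and matched.
template <class LoadFn>
void load_model(const std::string& prefix, LoadFn load_from)
{
#ifdef META_HAS_ZLIB
    if (meta::filesystem::file_exists(prefix + "/model.gz"))
    {
        meta::io::gzifstream input{prefix + "/model.gz"};
        load_from(input);
        return;
    }
#endif
    std::ifstream input{prefix + "/model", std::ios::binary};
    load_from(input);
}
~~~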
For saving, compression is chosen if zlib exists. --- include/parser/sr_parser.h | 6 ++++++ include/parser/transition_map.h | 7 +++++++ include/sequence/sequence_analyzer.h | 6 ++++++ src/parser/sr_parser.cpp | 15 ++++++++++++--- src/parser/transition_map.cpp | 15 ++++++++++++--- src/sequence/perceptron.cpp | 11 ++++++++--- src/sequence/sequence_analyzer.cpp | 14 +++++++++++--- 7 files changed, 62 insertions(+), 12 deletions(-) diff --git a/include/parser/sr_parser.h b/include/parser/sr_parser.h index 639cd7480..3fafa1854 100644 --- a/include/parser/sr_parser.h +++ b/include/parser/sr_parser.h @@ -268,6 +268,12 @@ class sr_parser best_transitions(const feature_vector& features, const state& state, size_t num, bool check_legality = false) const; + /** + * Loads the parser model file from the given stream. + * @param model The input stream to read from + */ + void load(std::istream& model); + /** * Storage for the ids for each transition */ diff --git a/include/parser/transition_map.h b/include/parser/transition_map.h index ff2189310..94603f6f4 100644 --- a/include/parser/transition_map.h +++ b/include/parser/transition_map.h @@ -75,6 +75,13 @@ class transition_map }; private: + + /** + * Loads the transitions from the given file. + * @param store The transitions model input stream + */ + void load(std::istream& store); + /** * The map from transition to id. */ diff --git a/include/sequence/sequence_analyzer.h b/include/sequence/sequence_analyzer.h index 11ee50dc5..a440de441 100644 --- a/include/sequence/sequence_analyzer.h +++ b/include/sequence/sequence_analyzer.h @@ -318,6 +318,12 @@ class sequence_analyzer */ void load_feature_id_mapping(const std::string& prefix); + /** + * Loads the feature_id mapping from disk using an input stream. + * @param input The input stream of the feature_id mapping + */ + void load_feature_id_mapping(std::istream& input); + /** * Loads the label_id mapping from disk. 
* @param prefix The folder to load the mapping from diff --git a/src/parser/sr_parser.cpp b/src/parser/sr_parser.cpp index 4fa992e20..7dd2378d1 100644 --- a/src/parser/sr_parser.cpp +++ b/src/parser/sr_parser.cpp @@ -16,6 +16,7 @@ #include "parser/trees/internal_node.h" #include "parser/trees/leaf_node.h" #include "parser/trees/visitors/debinarizer.h" +#include "util/filesystem.h" #include "util/progress.h" #include "util/range.h" #include "util/time.h" @@ -451,11 +452,19 @@ void sr_parser::save(const std::string& prefix) const void sr_parser::load(const std::string& prefix) { #ifdef META_HAS_ZLIB - io::gzifstream model{prefix + "/parser.model.gz"}; -#else - std::ifstream model{prefix + "/parser.model", std::ios::binary}; + if (filesystem::file_exists(prefix + "/parser.model.gz")) + { + io::gzifstream model{prefix + "/parser.model.gz"}; + load(model); + return; + } #endif + std::ifstream model{prefix + "/parser.model", std::ios::binary}; + load(model); +} +void sr_parser::load(std::istream& model) +{ if (!model) throw exception{"model file not found"}; diff --git a/src/parser/transition_map.cpp b/src/parser/transition_map.cpp index 9cbe3695c..ad406d38c 100644 --- a/src/parser/transition_map.cpp +++ b/src/parser/transition_map.cpp @@ -8,6 +8,7 @@ #include "io/binary.h" #include "parser/transition_map.h" +#include "util/filesystem.h" #ifdef META_HAS_ZLIB #include "io/gzstream.h" @@ -21,11 +22,19 @@ namespace parser transition_map::transition_map(const std::string& prefix) { #ifdef META_HAS_ZLIB - io::gzifstream store{prefix + "/parser.trans.gz"}; -#else - std::ifstream store{prefix + "/parser.trans", std::ios::binary}; + if (filesystem::file_exists(prefix + "/parser.trans.gz")) + { + io::gzifstream store{prefix + "/parser.trans.gz"}; + load(store); + return; + } #endif + std::ifstream store{prefix + "/parser.trans", std::ios::binary}; + load(store); +} +void transition_map::load(std::istream& store) +{ if (!store) throw exception{"missing transitions model file"}; diff --git a/src/sequence/perceptron.cpp b/src/sequence/perceptron.cpp index 77a09f119..eb994afb7 100644 --- a/src/sequence/perceptron.cpp +++ b/src/sequence/perceptron.cpp @@ -7,6 +7,7 @@ #include "sequence/perceptron.h" #include "utf/utf.h" +#include "util/filesystem.h" #include "util/progress.h" #include "util/time.h" @@ -51,10 +52,14 @@ perceptron::perceptron(const std::string& prefix) : perceptron() analyzer_.load(prefix); #if META_HAS_ZLIB - io::gzifstream file{prefix + "/tagger.model.gz"}; -#else - std::ifstream file{prefix + "/tagger.model"}; + if (filesystem::file_exists(prefix + "/tagger.model.gz")) + { + io::gzifstream file{prefix + "/tagger.model.gz"}; + model_.load(file); + return; + } #endif + std::ifstream file{prefix + "/tagger.model"}; model_.load(file); } diff --git a/src/sequence/sequence_analyzer.cpp b/src/sequence/sequence_analyzer.cpp index 0ea3e9bf7..e93108bf5 100644 --- a/src/sequence/sequence_analyzer.cpp +++ b/src/sequence/sequence_analyzer.cpp @@ -37,11 +37,19 @@ void sequence_analyzer::load(const std::string& prefix) void sequence_analyzer::load_feature_id_mapping(const std::string& prefix) { #if META_HAS_ZLIB - io::gzifstream input{prefix + "/feature.mapping.gz"}; -#else - std::ifstream input{prefix + "/feature.mapping", std::ios::binary}; + if (filesystem::file_exists(prefix + "/feature.mapping.gz")) + { + io::gzifstream input{prefix + "/feature.mapping.gz"}; + load_feature_id_mapping(input); + return; + } #endif + std::ifstream input{prefix + "/feature.mapping", std::ios::binary}; + 
load_feature_id_mapping(input); +} +void sequence_analyzer::load_feature_id_mapping(std::istream& input) +{ if (!input) throw exception{"missing feature id mapping"}; From eb5dc682b0eb10d40b42ba69550c378b7fcec627 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Thu, 18 Jun 2015 20:16:53 -0500 Subject: [PATCH 150/481] clang-format feature_selector --- src/features/feature_selector.cpp | 53 +++++++++++++++++-------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/src/features/feature_selector.cpp b/src/features/feature_selector.cpp index fe21ea95b..d397299f5 100644 --- a/src/features/feature_selector.cpp +++ b/src/features/feature_selector.cpp @@ -53,19 +53,19 @@ void feature_selector::score_all() { prog(tid); for (uint64_t lbl = 0; lbl < idx_->num_labels(); ++lbl) - scores[lbl][tid] - = std::make_pair(tid, score(static_cast(lbl + 1), term_id{tid})); + scores[lbl][tid] = std::make_pair( + tid, score(static_cast(lbl + 1), term_id{tid})); } prog.end(); - parallel::parallel_for(scores.begin(), scores.end(), [&](std::vector& v) - { - std::sort(v.begin(), v.end(), [&](const pair_t& a, const pair_t& b) - { - return a.second > b.second; + parallel::parallel_for( + scores.begin(), scores.end(), [&](std::vector& v) + { + std::sort(v.begin(), v.end(), [&](const pair_t& a, const pair_t& b) + { + return a.second > b.second; + }); }); - }); - for (uint64_t lbl = 0; lbl < idx_->num_labels(); ++lbl) { @@ -151,7 +151,8 @@ void feature_selector::print_summary(uint64_t k /* = 20 */) const double score; for (uint64_t lbl = 0; lbl < idx_->num_labels(); ++lbl) { - std::cout << std::endl << "Top " << k << " features for \"" + std::cout << std::endl + << "Top " << k << " features for \"" << idx_->class_label_from_id(static_cast(lbl + 1)) << "\":" << std::endl << "===============================" << std::endl; @@ -171,24 +172,27 @@ void feature_selector::print_summary(uint64_t k /* = 20 */) const double feature_selector::prob_term(term_id id) const { auto p = term_prob_.at(id); - if(p < 0 || p > 1) - throw std::runtime_error{std::string{__func__} + ": " + std::to_string(p)}; + if (p < 0 || p > 1) + throw std::runtime_error{std::string{__func__} + ": " + + std::to_string(p)}; return p; } double feature_selector::prob_class(label_id id) const { auto p = class_prob_.at(id - 1); - if(p < 0 || p > 1) - throw std::runtime_error{std::string{__func__} + ": " + std::to_string(p)}; + if (p < 0 || p > 1) + throw std::runtime_error{std::string{__func__} + ": " + + std::to_string(p)}; return p; } double feature_selector::term_and_class(term_id term, label_id label) const { auto p = co_occur_.at(label - 1).at(term); - if(p < 0 || p > 1) - throw std::runtime_error{std::string{__func__} + ": " + std::to_string(p)}; + if (p < 0 || p > 1) + throw std::runtime_error{std::string{__func__} + ": " + + std::to_string(p)}; return p; } @@ -196,25 +200,28 @@ double feature_selector::not_term_and_not_class(term_id term, label_id label) const { auto p = 1.0 - term_and_class(term, label) - not_term_and_class(term, label) - - term_and_not_class(term, label); - if(p < 0 || p > 1) - throw std::runtime_error{std::string{__func__} + ": " + std::to_string(p)}; + - term_and_not_class(term, label); + if (p < 0 || p > 1) + throw std::runtime_error{std::string{__func__} + ": " + + std::to_string(p)}; return p; } double feature_selector::term_and_not_class(term_id term, label_id label) const { auto p = term_prob_.at(term) - term_and_class(term, label); - if(p < 0 || p > 1) - throw std::runtime_error{std::string{__func__} + ": " + 
std::to_string(p)}; + if (p < 0 || p > 1) + throw std::runtime_error{std::string{__func__} + ": " + + std::to_string(p)}; return p; } double feature_selector::not_term_and_class(term_id term, label_id label) const { auto p = class_prob_.at(label - 1) - term_and_class(term, label); - if(p < 0 || p > 1) - throw std::runtime_error{std::string{__func__} + ": " + std::to_string(p)}; + if (p < 0 || p > 1) + throw std::runtime_error{std::string{__func__} + ": " + + std::to_string(p)}; return p; } } From 01c2f6f2b5e256b37ddb5ab701f1df3f968bd386 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Thu, 18 Jun 2015 20:18:55 -0500 Subject: [PATCH 151/481] only check feature probability bounds if DEBUG is set --- src/features/feature_selector.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/features/feature_selector.cpp b/src/features/feature_selector.cpp index d397299f5..438522957 100644 --- a/src/features/feature_selector.cpp +++ b/src/features/feature_selector.cpp @@ -172,27 +172,33 @@ void feature_selector::print_summary(uint64_t k /* = 20 */) const double feature_selector::prob_term(term_id id) const { auto p = term_prob_.at(id); +#if DEBUG if (p < 0 || p > 1) throw std::runtime_error{std::string{__func__} + ": " + std::to_string(p)}; +#endif return p; } double feature_selector::prob_class(label_id id) const { auto p = class_prob_.at(id - 1); +#if DEBUG if (p < 0 || p > 1) throw std::runtime_error{std::string{__func__} + ": " + std::to_string(p)}; +#endif return p; } double feature_selector::term_and_class(term_id term, label_id label) const { auto p = co_occur_.at(label - 1).at(term); +#if DEBUG if (p < 0 || p > 1) throw std::runtime_error{std::string{__func__} + ": " + std::to_string(p)}; +#endif return p; } @@ -201,27 +207,33 @@ double feature_selector::not_term_and_not_class(term_id term, { auto p = 1.0 - term_and_class(term, label) - not_term_and_class(term, label) - term_and_not_class(term, label); +#if DEBUG if (p < 0 || p > 1) throw std::runtime_error{std::string{__func__} + ": " + std::to_string(p)}; +#endif return p; } double feature_selector::term_and_not_class(term_id term, label_id label) const { auto p = term_prob_.at(term) - term_and_class(term, label); +#if DEBUG if (p < 0 || p > 1) throw std::runtime_error{std::string{__func__} + ": " + std::to_string(p)}; +#endif return p; } double feature_selector::not_term_and_class(term_id term, label_id label) const { auto p = class_prob_.at(label - 1) - term_and_class(term, label); +#if DEBUG if (p < 0 || p > 1) throw std::runtime_error{std::string{__func__} + ": " + std::to_string(p)}; +#endif return p; } } From b05793ffcee6ba49a440438f51f51b1a6edaf495 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 19 Jun 2015 17:38:58 -0500 Subject: [PATCH 152/481] document language_model class and provide default config settings --- config.toml | 3 +++ include/lm/language_model.h | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/config.toml b/config.toml index da6154435..a770191ca 100644 --- a/config.toml +++ b/config.toml @@ -85,3 +85,6 @@ section-size = 99 train-sections = [2, 21] dev-sections = [22, 22] test-sections = [23, 23] + +[language-model] +arpa-file = "../data/english-sentences.arpa" diff --git a/include/lm/language_model.h b/include/lm/language_model.h index 9e69c59f4..1973ec380 100644 --- a/include/lm/language_model.h +++ b/include/lm/language_model.h @@ -21,6 +21,23 @@ namespace meta { namespace lm { +/** + * A very simple language model class that reads existing language model data + * 
from a .arpa file. Currently, estimation of the language model is not + * implemented. We recommend using KenLM to generate a .arpa file from a text + * corpus that has been (optionally) preprocessed by MeTA. + * + * @see http://www.speech.sri.com/projects/srilm/manpages/ngram-format.5.html + * @see https://kheafield.com/code/kenlm/ + * + * Required config parameters: + * ~~~toml + * [language-model] + * arpa-file = "path-to-file" + * ~~~ + * + * Optional config parameters: none. + */ class language_model { private: @@ -41,6 +58,8 @@ class language_model float backoff; }; + // This structure could be switched out for something more efficient, such + // as a static linear probing hash table using map_t = std::unordered_map; public: From 3a0e2c39e82928fc05c0c4ec2459ab0715a63d1d Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 19 Jun 2015 17:40:21 -0500 Subject: [PATCH 153/481] remove the outdated [language-model] config section from config.toml --- config.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/config.toml b/config.toml index a770191ca..0eb6c647f 100644 --- a/config.toml +++ b/config.toml @@ -59,10 +59,6 @@ insert-penalty = 0.0 substitute-penalty = 0.0 remove-penalty = 0.0 -[language-model] -format = "learn" -n-value = 3 - [features] method = "info-gain" prefix = "features" From b9d68b1d62885939f8cf1048937b6f15fd142c8b Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 19 Jun 2015 17:44:07 -0500 Subject: [PATCH 154/481] provide usage info for create-dataset application --- src/lm/tools/create-dataset.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/lm/tools/create-dataset.cpp b/src/lm/tools/create-dataset.cpp index 8ab4cfd22..35de29f99 100644 --- a/src/lm/tools/create-dataset.cpp +++ b/src/lm/tools/create-dataset.cpp @@ -14,6 +14,13 @@ using namespace meta; int main(int argc, char* argv[]) { + if (argc != 3) + { + std::cerr << "Usage:\t" << argv[0] << " config.toml input.txt" + << std::endl; + return 1; + } + bool diagnostic = true; auto config = cpptoml::parse_file(argv[1]); lm::diff correcter{*config.get_table("diff")}; From e3cb94054182c63427c4601acea589c2cb753819 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 20 Jun 2015 13:42:23 -0500 Subject: [PATCH 155/481] basic feature selection unit tests --- include/test/features_test.h | 28 ++++++++++++++ src/test/CMakeLists.txt | 4 +- src/test/features_test.cpp | 73 ++++++++++++++++++++++++++++++++++++ src/test/tools/unit-test.cpp | 4 ++ src/test/unit_tests.cmake | 4 ++ 5 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 include/test/features_test.h create mode 100644 src/test/features_test.cpp diff --git a/include/test/features_test.h b/include/test/features_test.h new file mode 100644 index 000000000..b136152d9 --- /dev/null +++ b/include/test/features_test.h @@ -0,0 +1,28 @@ +/** + * @file features_test.h + * @author Sean Massung + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_FEATURES_TEST_H_ +#define META_FEATURES_TEST_H_ + +#include "test/unit_test.h" + +namespace meta +{ +namespace testing +{ + +/** + * Runs all the feature selection tests. 
+ * @return the number of tests failed + */ +int features_tests(); + +} +} +#endif diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index f1344f77a..1e81ae6c1 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -15,9 +15,11 @@ add_library(meta-testing analyzer_test.cpp stemmer_test.cpp string_list_test.cpp graph_test.cpp + features_test.cpp vocabulary_map_test.cpp parser_test.cpp) -target_link_libraries(meta-testing meta-index meta-classify meta-parser) +target_link_libraries(meta-testing meta-index meta-classify meta-parser + meta-features) set(UNIT_TEST_EXE unit-test) include(unit_tests.cmake) diff --git a/src/test/features_test.cpp b/src/test/features_test.cpp new file mode 100644 index 000000000..4c35a509e --- /dev/null +++ b/src/test/features_test.cpp @@ -0,0 +1,73 @@ +/** + * @file features_test.cpp + * @author Sean Massung + */ + +#include "test/features_test.h" +#include "test/inverted_index_test.h" +#include "features/feature_selector.h" +#include "features/selector_factory.h" +#include "features/all.h" + +namespace meta +{ +namespace testing +{ + +namespace +{ +template +void test_construction(Index& idx, const std::string& id) +{ + std::ofstream fconfig{"feature-config.toml"}; + fconfig << "[features]\nmethod = \"" << id << "\"\n" + << "prefix = \"test-features\""; + fconfig.close(); + auto config = cpptoml::parse_file("feature-config.toml"); + auto selector = features::make_selector(config, idx); + selector->select(20); + selector->select(50); + selector->select_percent(0.05); + selector->select_percent(0.10); + + auto t_id = idx->get_term_id("china"); // this term should be selected + ASSERT(selector->selected(t_id)); + + // CEEAUS has three classes + ASSERT(filesystem::file_exists("test-features." + id + ".1")); + ASSERT(filesystem::file_exists("test-features." + id + ".2")); + ASSERT(filesystem::file_exists("test-features." + id + ".3")); + ASSERT(filesystem::file_exists("test-features." 
+ id + ".selected")); +} +} + +int features_tests() +{ + int failed = 0; + create_config("line"); + + // scope for forward index object + { + auto f_idx = index::make_index( + "test-config.toml"); + + failed += testing::run_test("chi-square", [&]() + { + test_construction(f_idx, "chi-square"); + }); + failed += testing::run_test("info-gain", [&]() + { + test_construction(f_idx, "info-gain"); + }); + failed += testing::run_test("corr-coef", [&]() + { + test_construction(f_idx, "corr-coef"); + }); + } + + system("rm -rf ceeaus-* test-features.*"); + filesystem::delete_file("feature-config.toml"); + return failed; +} +} +} diff --git a/src/test/tools/unit-test.cpp b/src/test/tools/unit-test.cpp index 8ecd599a1..14e87ecb6 100644 --- a/src/test/tools/unit-test.cpp +++ b/src/test/tools/unit-test.cpp @@ -23,6 +23,7 @@ #include "test/parser_test.h" #include "test/lm_test.h" #include "test/filesystem_test.h" +#include "test/features_test.h" #include "util/printing.h" using namespace meta; @@ -51,6 +52,7 @@ int main(int argc, char* argv[]) std::cerr << " \"parser\": runs parser tests" << std::endl; std::cerr << " \"language-model\": runs language model tests" << std::endl; std::cerr << " \"filesystem\": runs filesystem tests" << std::endl; + std::cerr << " \"features\": runs feature selection tests" << std::endl; return 1; } @@ -92,6 +94,8 @@ int main(int argc, char* argv[]) num_failed += testing::lm_tests(); if (all || args.find("filesystem") != args.end()) num_failed += testing::filesystem_tests(); + if (all || args.find("features") != args.end()) + num_failed += testing::features_tests(); return num_failed; } diff --git a/src/test/unit_tests.cmake b/src/test/unit_tests.cmake index 7b605f6ed..84965daa0 100644 --- a/src/test/unit_tests.cmake +++ b/src/test/unit_tests.cmake @@ -61,3 +61,7 @@ set_tests_properties(language-model PROPERTIES TIMEOUT 10 WORKING_DIRECTORY add_test(filesystem ${UNIT_TEST_EXE} filesystem) set_tests_properties(filesystem PROPERTIES TIMEOUT 10 WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) + +add_test(features ${UNIT_TEST_EXE} features) +set_tests_properties(features PROPERTIES TIMEOUT 10 WORKING_DIRECTORY + ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) From 2f1f72cd3d4201d35af1812f16bc02cd5e5ff5ff Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 20 Jun 2015 14:05:37 -0500 Subject: [PATCH 156/481] update feature selector comments --- include/features/feature_selector.h | 11 +++++++++++ include/features/selector_factory.h | 6 +++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/include/features/feature_selector.h b/include/features/feature_selector.h index f7ad78b9b..da799aac8 100644 --- a/include/features/feature_selector.h +++ b/include/features/feature_selector.h @@ -25,6 +25,17 @@ namespace features * The base class that shows the feature selection interface for MeTA, allowing * dimensionality reduction for documents as well as descriptions of classes by * their useful features. + * + * Required config parameters: + * ~~~toml + * method = "corr-coef" # choose the feature selection algorithm + * prefix = "file-prefix" + * ~~~ + * + * Optional config parameters: + * ~~~toml + * features-per-class = 20 # default + * ~~~ */ class feature_selector { diff --git a/include/features/selector_factory.h b/include/features/selector_factory.h index a9a153728..d1b6a2ee6 100644 --- a/include/features/selector_factory.h +++ b/include/features/selector_factory.h @@ -65,9 +65,9 @@ std::unique_ptr std::shared_ptr idx); /** - * Factory method for creating a ranker. 
This should be specialized if - * your given ranker requires special construction behavior (e.g., - * reading parameters). + * Factory method for creating a feature selector. This should be specialized if + * your selector requires special construction behavior (e.g., reading + * parameters). */ template std::unique_ptr From 5d6d79c1246fe41c5e75817ee2715ee5cdf2bce1 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Thu, 25 Jun 2015 14:52:55 -0500 Subject: [PATCH 157/481] add odds ratio feature selection --- include/features/all.h | 1 + include/features/odds_ratio.h | 44 +++++++++++++++++++++++++++++++ src/features/CMakeLists.txt | 1 + src/features/odds_ratio.cpp | 28 ++++++++++++++++++++ src/features/selector_factory.cpp | 1 + 5 files changed, 75 insertions(+) create mode 100644 include/features/odds_ratio.h create mode 100644 src/features/odds_ratio.cpp diff --git a/include/features/all.h b/include/features/all.h index 7bcdce320..b2ca4e583 100644 --- a/include/features/all.h +++ b/include/features/all.h @@ -1,3 +1,4 @@ #include "features/chi_square.h" #include "features/information_gain.h" #include "features/correlation_coefficient.h" +#include "features/odds_ratio.h" diff --git a/include/features/odds_ratio.h b/include/features/odds_ratio.h new file mode 100644 index 000000000..de184fb61 --- /dev/null +++ b/include/features/odds_ratio.h @@ -0,0 +1,44 @@ +/** + * @file odds_ratio.h + * @author Sean Massung + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_ODDS_RATIO_H_ +#define META_ODDS_RATIO_H_ + +#include "features/feature_selector.h" + +namespace meta +{ +namespace features +{ +/** + * Performs odds ratio feature selection: + * \f$ OR(t, c_i) = + * \log \frac{P(t|c_i)(1-P(t|\overline{c_i}))}{(1-P(t|c_i))P(t|\overline{c_i})} + * \f$ + */ +class odds_ratio: public feature_selector +{ + public: + /// Inherit constructor. + using feature_selector::feature_selector; + + /// Identifier for this feature_selector. + const static std::string id; + + /** + * Scores the (label_id, term) pair according to this feature selection + * metric. 
+ * @param lid + * @param tid + */ + virtual double score(label_id lid, term_id tid) const override; +}; +} +} +#endif diff --git a/src/features/CMakeLists.txt b/src/features/CMakeLists.txt index e87998125..c5ef7f4f1 100644 --- a/src/features/CMakeLists.txt +++ b/src/features/CMakeLists.txt @@ -5,6 +5,7 @@ add_subdirectory(tools) add_library(meta-features feature_selector.cpp selector_factory.cpp chi_square.cpp + odds_ratio.cpp correlation_coefficient.cpp information_gain.cpp) target_link_libraries(meta-features meta-index ${CMAKE_THREAD_LIBS_INIT}) diff --git a/src/features/odds_ratio.cpp b/src/features/odds_ratio.cpp new file mode 100644 index 000000000..f163fb0fb --- /dev/null +++ b/src/features/odds_ratio.cpp @@ -0,0 +1,28 @@ +/** + * @file odds_ratio.cpp + * @author Sean Massung + */ + +#include "features/odds_ratio.h" + +namespace meta +{ +namespace features +{ +const std::string odds_ratio::id = "odds-ratio"; + +double odds_ratio::score(label_id lid, term_id tid) const +{ + double p_tc = term_and_class(tid, lid); + double p_tnc = term_and_not_class(tid, lid); + double numerator = p_tc * (1.0 - p_tnc); + double denominator = (1.0 - p_tc) * p_tnc; + + // avoid divide by zero + if (denominator == 0.0) + return 0.0; + + return std::log(numerator / denominator); +} +} +} diff --git a/src/features/selector_factory.cpp b/src/features/selector_factory.cpp index 8b455d6de..a308ea954 100644 --- a/src/features/selector_factory.cpp +++ b/src/features/selector_factory.cpp @@ -24,6 +24,7 @@ selector_factory::selector_factory() reg(); reg(); reg(); + reg(); } std::unique_ptr From 5807b30334c1dfb0e22ba30ebb08a2b56a980d18 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Thu, 25 Jun 2015 14:53:20 -0500 Subject: [PATCH 158/481] include odds ratio in feature selector unit tests --- src/test/features_test.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/test/features_test.cpp b/src/test/features_test.cpp index 4c35a509e..e30510af7 100644 --- a/src/test/features_test.cpp +++ b/src/test/features_test.cpp @@ -63,6 +63,10 @@ int features_tests() { test_construction(f_idx, "corr-coef"); }); + failed += testing::run_test("odds-ratio", [&]() + { + test_construction(f_idx, "odds-ratio"); + }); } system("rm -rf ceeaus-* test-features.*"); From c6ff39475567dd8e3099f971b4c034d65d387d5b Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 31 Jul 2015 11:58:45 -0500 Subject: [PATCH 159/481] write static_probe_map for lm --- include/lm/language_model.h | 9 +- include/util/static_probe_map.h | 191 ++++++++++++++++++++++++++++++ include/util/static_probe_map.tcc | 79 ++++++++++++ src/lm/language_model.cpp | 8 +- 4 files changed, 283 insertions(+), 4 deletions(-) create mode 100644 include/util/static_probe_map.h create mode 100644 include/util/static_probe_map.tcc diff --git a/include/lm/language_model.h b/include/lm/language_model.h index 1973ec380..68d3479c6 100644 --- a/include/lm/language_model.h +++ b/include/lm/language_model.h @@ -16,6 +16,7 @@ #include #include "cpptoml.h" #include "lm/sentence.h" +#include "util/static_probe_map.h" namespace meta { @@ -54,13 +55,19 @@ class language_model { } + bool operator==(const lm_node& other) const + { + return prob == other.prob && backoff == other.backoff; + } + float prob; float backoff; }; // This structure could be switched out for something more efficient, such // as a static linear probing hash table - using map_t = std::unordered_map; + using map_t = util::static_probe_map; + //using map_t = std::unordered_map; public: /** diff --git 
a/include/util/static_probe_map.h b/include/util/static_probe_map.h new file mode 100644 index 000000000..28a11bce9 --- /dev/null +++ b/include/util/static_probe_map.h @@ -0,0 +1,191 @@ +/** + * @file static_probe_map.h + * @author Sean Massung + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_STATIC_PROBE_MAP_H_ +#define META_STATIC_PROBE_MAP_H_ + +#include +#include + +namespace meta +{ +namespace util +{ + +/** + * + */ +template +class static_probe_map +{ + public: + /** + * Constructor. + * @param elems The number of elements that will be stored in this map. + * Note that the storage required will be more than this amount in order to + * have an acceptable load factor. + */ + static_probe_map(uint64_t num_elems); + + Value& operator[](const Key& key); + + const Value& operator[](const Key& key) const; + + Value& at(const Key& key); + + const Value& at(const Key& key) const; + + /** + * The "inner" iterator representation of the static_probe_map. + */ + using InnerIterator = + typename std::vector>::const_iterator; + + /** + * The static_probe_map iterator is really just a wrapper for the internal + * vector> iterator. + */ + class Iterator + : public std::iterator + { + private: + /// The iterator of the underlying vector + InnerIterator iter; + + public: + /// Constructor. + Iterator() + { + /* nothing */ + } + + /// Copy constructor. + Iterator(const InnerIterator& other) : iter{other} + { + /* nothing */ + } + + /// Pre-Increment. + Iterator& operator++() + { + ++iter; + return *this; + } + + /// Post-increment. + Iterator operator++(int) + { + InnerIterator save{iter}; + ++iter; + return Iterator{save}; + } + + /// Pre-decrement. + Iterator& operator--() + { + --iter; + return *this; + } + + /// Post-decrement. + Iterator operator--(int) + { + InnerIterator save{iter}; + --iter; + return Iterator{save}; + } + + /// Iterator equality. + bool operator==(const Iterator& other) + { + return iter == other.iter; + } + + /// Iterator inequality. + bool operator!=(const Iterator& other) + { + return iter != other.iter; + } + + /** + * Dereference operator. Returns the underlying value_type, + * which will always be a std::pair + * @return a reference to the value of the object that is dereferenced + */ + const typename InnerIterator::value_type& operator*() + { + return *iter; + } + + /** + * Arrow operator. 
Returns a pointer to the underlying + * value_type, which will always be a std::pair + * @return a pointer to the value of the object that is dereferenced + */ + const typename InnerIterator::value_type* operator->() + { + return &(*iter); + } + }; + + /** + * Easier typename to deal with if capital, also lets const_iterator + * share same name + */ + typedef Iterator iterator; + + /// Lets const_iterator be interchangeable with "iterator" + typedef Iterator const_iterator; + + Iterator find(const Key& key) const; + + private: + /// The internal map representing Key -> Value pairs + std::vector> table_; + + /// Hash function for this hash table + std::hash hash_; + + /// Empty pair + const std::pair empty_; + + public: + /** + * @return an iterator to the beginning of this container + */ + const_iterator begin() const + { + auto it = table_.begin(); + while (*it == empty_ && it != table_.end()) + ++it; + return it; + } + + /** + * @return an iterator to the end of this container + */ + const_iterator end() const + { + return table_.end(); + } + + /** + * Basic exception for static_probe_map interactions. + */ + class static_probe_map_exception : public std::runtime_error + { + public: + using std::runtime_error::runtime_error; + }; +}; +} +} + +#include "util/static_probe_map.tcc" +#endif diff --git a/include/util/static_probe_map.tcc b/include/util/static_probe_map.tcc new file mode 100644 index 000000000..819df2ad8 --- /dev/null +++ b/include/util/static_probe_map.tcc @@ -0,0 +1,79 @@ +/** + * @file static_probe_map.tcc + * @author Sean Massung + */ + +namespace meta +{ +namespace util +{ +template +static_probe_map::static_probe_map(uint64_t num_elems) + : table_(num_elems / 0.7) // load factor of 0.7 +{ +} + +template +Value& static_probe_map::operator[](const Key& key) +{ + auto idx = hash_(key) % table_.size(); + while (true) + { + if (table_[idx] == empty_) + { + table_[idx].first = key; + return table_[idx].second; + } + + if (table_[idx].first == key) + return table_[idx].second; + + idx = (idx + 1) % table_.size(); + } +} + +template +Value& static_probe_map::at(const Key& key) +{ + return (*this)[key]; +} + +template +const Value& static_probe_map::at(const Key& key) const +{ + return (*this)[key]; +} + +template +const Value& static_probe_map::operator[](const Key& key) const +{ + auto idx = hash_(key) % table_.size(); + while (true) + { + if (table_[idx] == empty_) + throw static_probe_map_exception{"key does not exist"}; + + if (table_[idx].first == key) + return table_[idx].second; + + idx = (idx + 1) % table_.size(); + } +} + +template +auto static_probe_map::find(const Key& key) const -> Iterator +{ + auto idx = hash_(key) % table_.size(); + while (true) + { + if (table_[idx] == empty_) + return end(); + + if (table_[idx].first == key) + return {table_.begin() + idx}; + + idx = (idx + 1) % table_.size(); + } +} +} +} diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index 6cdbdd623..4a5fbfe13 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -7,13 +7,13 @@ * project. 
*/ -#include #include #include #include "util/time.h" #include "util/shim.h" #include "lm/language_model.h" #include "logging/logger.h" +#include "util/static_probe_map.h" namespace meta { @@ -55,12 +55,14 @@ void language_model::read_arpa_format(const std::string& arpa_file) lm_.emplace_back(count[N_]); // add current n-value data while (std::getline(infile, buffer)) { - if (buffer.empty()) + // if blank or end + if (buffer.empty() || (buffer[0] == '\\' && buffer[1] == 'e')) continue; + // if start of new ngram data if (buffer[0] == '\\') { - lm_.emplace_back(count[N_++]); // add current n-value data + lm_.emplace_back(count[++N_]); // add current n-value data continue; } From 73c32ee81879fc38005a3f17d097893950a8e284 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 31 Jul 2015 12:08:47 -0500 Subject: [PATCH 160/481] fix bug in lm where extra (empty) lm probs were added --- src/lm/language_model.cpp | 5 ++--- src/test/lm_test.cpp | 9 ++++----- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index 6cdbdd623..522d03a18 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -7,7 +7,6 @@ * project. */ -#include #include #include #include "util/time.h" @@ -55,12 +54,12 @@ void language_model::read_arpa_format(const std::string& arpa_file) lm_.emplace_back(count[N_]); // add current n-value data while (std::getline(infile, buffer)) { - if (buffer.empty()) + if (buffer.empty() || (buffer[0] == '\\' && buffer[1] == 'e')) continue; if (buffer[0] == '\\') { - lm_.emplace_back(count[N_++]); // add current n-value data + lm_.emplace_back(count[++N_]); // add current n-value data continue; } diff --git a/src/test/lm_test.cpp b/src/test/lm_test.cpp index 48ac16e83..b61989b79 100644 --- a/src/test/lm_test.cpp +++ b/src/test/lm_test.cpp @@ -2,7 +2,6 @@ * @file lm_test.cpp * @author Sean Massung */ - #include "lm/sentence.h" #include "lm/language_model.h" #include "test/lm_test.h" @@ -31,10 +30,10 @@ int lm_tests() lm::sentence s3{" Hello world ! ", false}; lm::sentence s4{" xyz xyz xyz ", false}; - ASSERT_APPROX_EQUAL(model.log_prob(s1), -5.0682507); - ASSERT_APPROX_EQUAL(model.log_prob(s2), -11.7275571); - ASSERT_APPROX_EQUAL(model.log_prob(s3), -11.0764951); - ASSERT_APPROX_EQUAL(model.log_prob(s4), -16.4180412); + ASSERT_APPROX_EQUAL(model.log_prob(s1), -13.58225155); + ASSERT_APPROX_EQUAL(model.log_prob(s2), -16.32878304); + ASSERT_APPROX_EQUAL(model.log_prob(s3), -11.07649517); + ASSERT_APPROX_EQUAL(model.log_prob(s4), -16.41804123); }); return num_failed; From fbc51eacaef77e0071ac8caa1b36b3764c8c5aaf Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sun, 2 Aug 2015 12:18:48 -0500 Subject: [PATCH 161/481] use hash(key) instead of key; faster and less memory but can't do top_k --- include/util/static_probe_map.h | 10 ++++------ include/util/static_probe_map.tcc | 23 +++++++++++++---------- src/lm/language_model.cpp | 2 ++ src/lm/tools/lm-test.cpp | 15 +++++++++++++++ 4 files changed, 34 insertions(+), 16 deletions(-) diff --git a/include/util/static_probe_map.h b/include/util/static_probe_map.h index 28a11bce9..75892a77a 100644 --- a/include/util/static_probe_map.h +++ b/include/util/static_probe_map.h @@ -12,6 +12,7 @@ #include #include +#include "util/disk_vector.h" namespace meta { @@ -45,7 +46,7 @@ class static_probe_map * The "inner" iterator representation of the static_probe_map. 
*/ using InnerIterator = - typename std::vector>::const_iterator; + typename std::vector>::const_iterator; /** * The static_probe_map iterator is really just a wrapper for the internal @@ -147,14 +148,11 @@ class static_probe_map private: /// The internal map representing Key -> Value pairs - std::vector> table_; + std::vector> table_; /// Hash function for this hash table std::hash hash_; - /// Empty pair - const std::pair empty_; - public: /** * @return an iterator to the beginning of this container @@ -162,7 +160,7 @@ class static_probe_map const_iterator begin() const { auto it = table_.begin(); - while (*it == empty_ && it != table_.end()) + while (it->first == uint64_t{0} && it != table_.end()) ++it; return it; } diff --git a/include/util/static_probe_map.tcc b/include/util/static_probe_map.tcc index 819df2ad8..3c2d6d9a1 100644 --- a/include/util/static_probe_map.tcc +++ b/include/util/static_probe_map.tcc @@ -16,16 +16,17 @@ static_probe_map::static_probe_map(uint64_t num_elems) template Value& static_probe_map::operator[](const Key& key) { - auto idx = hash_(key) % table_.size(); + auto hashed = hash_(key); + auto idx = hashed % table_.size(); while (true) { - if (table_[idx] == empty_) + if (table_[idx].first == uint64_t{0}) { - table_[idx].first = key; + table_[idx].first = hashed; return table_[idx].second; } - if (table_[idx].first == key) + if (table_[idx].first == hashed) return table_[idx].second; idx = (idx + 1) % table_.size(); @@ -47,13 +48,14 @@ const Value& static_probe_map::at(const Key& key) const template const Value& static_probe_map::operator[](const Key& key) const { - auto idx = hash_(key) % table_.size(); + auto hashed = hash_(key); + auto idx = hashed % table_.size(); while (true) { - if (table_[idx] == empty_) + if (table_[idx].first == uint64_t{0}) throw static_probe_map_exception{"key does not exist"}; - if (table_[idx].first == key) + if (table_[idx].first == hashed) return table_[idx].second; idx = (idx + 1) % table_.size(); @@ -63,13 +65,14 @@ const Value& static_probe_map::operator[](const Key& key) const template auto static_probe_map::find(const Key& key) const -> Iterator { - auto idx = hash_(key) % table_.size(); + auto hashed = hash_(key); + auto idx = hashed % table_.size(); while (true) { - if (table_[idx] == empty_) + if (table_[idx].first == uint64_t{0}) return end(); - if (table_[idx].first == key) + if (table_[idx].first == hashed) return {table_.begin() + idx}; idx = (idx + 1) % table_.size(); diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index 4a5fbfe13..426721477 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -87,6 +87,7 @@ std::vector> return a.second > b.second; }; std::vector candidates; + /* sentence candidate = prev; candidate.push_back("word"); // the last item is replaced each iteration for (auto& word : lm_[0]) @@ -103,6 +104,7 @@ std::vector> for (auto end = candidates.end(); end != candidates.begin(); --end) std::pop_heap(candidates.begin(), end, comp); + */ return candidates; } diff --git a/src/lm/tools/lm-test.cpp b/src/lm/tools/lm-test.cpp index eb7614202..42763f41f 100644 --- a/src/lm/tools/lm-test.cpp +++ b/src/lm/tools/lm-test.cpp @@ -26,6 +26,20 @@ int main(int argc, char* argv[]) logging::set_cerr_logging(); + lm::language_model model{cpptoml::parse_file(argv[1])}; + std::string line; + std::ifstream in{argv[2]}; + while (in) + { + std::getline(in, line); + if (line.empty()) + continue; + + lm::sentence sent{line}; + std::cout << model.log_prob(sent) << std::endl; + } + + /* 
lm::diff correcter{cpptoml::parse_file(argv[1])}; std::ifstream in{argv[2]}; auto num_sentences = filesystem::num_lines(argv[2]); @@ -66,4 +80,5 @@ int main(int argc, char* argv[]) prog.end(); std::cout << "Percent no-ops: " << do_nothing / done << std::endl; + */ } From 3cb00ff26f1e687c71c48a3c098beb8e4f738781 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sun, 2 Aug 2015 21:06:59 -0500 Subject: [PATCH 162/481] specialize static_probe_map for language_model --- include/analyzers/diff_analyzer.h | 2 +- include/lm/diff.h | 2 + include/lm/language_model.h | 34 +----- include/lm/lm_node.h | 50 ++++++++ include/lm/static_probe_map.h | 81 +++++++++++++ include/util/static_probe_map.h | 189 ------------------------------ include/util/static_probe_map.tcc | 82 ------------- src/analyzers/diff_analyzer.cpp | 4 +- src/lm/CMakeLists.txt | 1 + src/lm/language_model.cpp | 64 ++++++---- src/lm/static_probe_map.cpp | 61 ++++++++++ 11 files changed, 242 insertions(+), 328 deletions(-) create mode 100644 include/lm/lm_node.h create mode 100644 include/lm/static_probe_map.h delete mode 100644 include/util/static_probe_map.h delete mode 100644 include/util/static_probe_map.tcc create mode 100644 src/lm/static_probe_map.cpp diff --git a/include/analyzers/diff_analyzer.h b/include/analyzers/diff_analyzer.h index 6fa978b22..0e7bc0571 100644 --- a/include/analyzers/diff_analyzer.h +++ b/include/analyzers/diff_analyzer.h @@ -48,7 +48,7 @@ class diff_analyzer : public util::clonable /// The token stream to be used for extracting tokens std::unique_ptr stream_; - lm::diff diff_; + std::shared_ptr diff_; }; /** diff --git a/include/lm/diff.h b/include/lm/diff.h index e796a111e..f7b1f9b2b 100644 --- a/include/lm/diff.h +++ b/include/lm/diff.h @@ -27,6 +27,8 @@ class diff */ diff(const cpptoml::table& config); + diff(diff&&) = default; + /** * @param sent The sentence to transform * @param use_lm diff --git a/include/lm/language_model.h b/include/lm/language_model.h index 68d3479c6..fa1014826 100644 --- a/include/lm/language_model.h +++ b/include/lm/language_model.h @@ -16,7 +16,7 @@ #include #include "cpptoml.h" #include "lm/sentence.h" -#include "util/static_probe_map.h" +#include "lm/static_probe_map.h" namespace meta { @@ -41,34 +41,6 @@ namespace lm */ class language_model { - private: - /** - * Simple struct to keep track of probabilities and backoff values. 
- */ - struct lm_node - { - lm_node() : prob{0.0f}, backoff{0.0f} - { - } - - lm_node(float p, float b) : prob{p}, backoff{b} - { - } - - bool operator==(const lm_node& other) const - { - return prob == other.prob && backoff == other.backoff; - } - - float prob; - float backoff; - }; - - // This structure could be switched out for something more efficient, such - // as a static linear probing hash table - using map_t = util::static_probe_map; - //using map_t = std::unordered_map; - public: /** * Creates an N-gram language model based on the corpus specified in the @@ -76,6 +48,8 @@ class language_model */ language_model(const cpptoml::table& config); + language_model(language_model&&) = default; + /** * @param sentence A sequence of tokens * @return the perplexity of this token sequence given the current language @@ -120,7 +94,7 @@ class language_model uint64_t N_; /// The "n" value for this n-gram language model - std::vector lm_; + std::vector lm_; }; class language_model_exception : public std::runtime_error diff --git a/include/lm/lm_node.h b/include/lm/lm_node.h new file mode 100644 index 000000000..44c57135f --- /dev/null +++ b/include/lm/lm_node.h @@ -0,0 +1,50 @@ +/** + * @file lm_node.h + * @author Sean Massung + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_LM_NODE_H_ +#define META_LM_NODE_H_ + +#include + +namespace meta +{ +namespace lm +{ +/** + * Simple struct to keep track of probabilities and backoff values that is + * packed into a uint64_t for storage. + */ +struct lm_node +{ + lm_node() : prob{0.0f}, backoff{0.0f} + { + } + + lm_node(float p, float b) : prob{p}, backoff{b} + { + } + + lm_node(uint64_t packed) + { + prob = *reinterpret_cast(&packed); + backoff = *(reinterpret_cast(&packed) + 1); + } + + bool operator==(const lm_node& other) const + { + return prob == other.prob && backoff == other.backoff; + } + + float prob; + float backoff; +}; +} +} + +#endif diff --git a/include/lm/static_probe_map.h b/include/lm/static_probe_map.h new file mode 100644 index 000000000..77f0b39de --- /dev/null +++ b/include/lm/static_probe_map.h @@ -0,0 +1,81 @@ +/** + * @file static_probe_map.h + * @author Sean Massung + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_STATIC_PROBE_MAP_H_ +#define META_STATIC_PROBE_MAP_H_ + +#include +#include "lm/lm_node.h" +#include "util/disk_vector.h" +#include "util/optional.h" + +namespace meta +{ +namespace lm +{ +/** + * Represents language model probabilities as string -> (prob, backoff) values. + * For space and time efficiency, this class only stores the uint64_t hash of + * the string keys, so it is not possible to query which keys exist in the + * table. The values of (prob, backoff) are stored as two packed floats in a + * uint64_t. The use of uint64_t allows storage to exist in a util::disk_vector, + * which makes loading after the initial creation relatively fast. + */ +class static_probe_map +{ + static_assert(sizeof(float) == 4, "two floats need to occupy 8 bytes!"); + + public: + /** + * Constructor. + * @param num_elems The number of elements that will be stored in this map. + * Note that the storage required will be more than this amount in order to + * have an acceptable load factor. If num_elems is zero, binary LM files + * are loaded. 
+ */ + static_probe_map(const std::string& filename, uint64_t num_elems = 0); + + static_probe_map(static_probe_map&&) = default; + + /** + * @param key The string key to search for in this probe map + * @return an optional language model node containing the probability and + * backoff value for the key + */ + util::optional find(const std::string& key) const; + + /** + * @param key The string key to insert (though only a uint64_t hash is + * stored; if the hash already exists, an exception is thrown) + * @param prob The probability of the key in this LM + * @param backoff The backoff probability for this LM + */ + void insert(const std::string& key, float prob, float backoff); + + private: + /// The internal map representing std::string -> lm_node pairs + util::disk_vector table_; + + /// Convert strings to uint64_ts + std::hash hash_; + + public: + /** + * Basic exception for static_probe_map interactions. + */ + class static_probe_map_exception : public std::runtime_error + { + public: + using std::runtime_error::runtime_error; + }; +}; +} +} + +#endif diff --git a/include/util/static_probe_map.h b/include/util/static_probe_map.h deleted file mode 100644 index 75892a77a..000000000 --- a/include/util/static_probe_map.h +++ /dev/null @@ -1,189 +0,0 @@ -/** - * @file static_probe_map.h - * @author Sean Massung - * - * All files in META are dual-licensed under the MIT and NCSA licenses. For more - * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the - * project. - */ - -#ifndef META_STATIC_PROBE_MAP_H_ -#define META_STATIC_PROBE_MAP_H_ - -#include -#include -#include "util/disk_vector.h" - -namespace meta -{ -namespace util -{ - -/** - * - */ -template -class static_probe_map -{ - public: - /** - * Constructor. - * @param elems The number of elements that will be stored in this map. - * Note that the storage required will be more than this amount in order to - * have an acceptable load factor. - */ - static_probe_map(uint64_t num_elems); - - Value& operator[](const Key& key); - - const Value& operator[](const Key& key) const; - - Value& at(const Key& key); - - const Value& at(const Key& key) const; - - /** - * The "inner" iterator representation of the static_probe_map. - */ - using InnerIterator = - typename std::vector>::const_iterator; - - /** - * The static_probe_map iterator is really just a wrapper for the internal - * vector> iterator. - */ - class Iterator - : public std::iterator - { - private: - /// The iterator of the underlying vector - InnerIterator iter; - - public: - /// Constructor. - Iterator() - { - /* nothing */ - } - - /// Copy constructor. - Iterator(const InnerIterator& other) : iter{other} - { - /* nothing */ - } - - /// Pre-Increment. - Iterator& operator++() - { - ++iter; - return *this; - } - - /// Post-increment. - Iterator operator++(int) - { - InnerIterator save{iter}; - ++iter; - return Iterator{save}; - } - - /// Pre-decrement. - Iterator& operator--() - { - --iter; - return *this; - } - - /// Post-decrement. - Iterator operator--(int) - { - InnerIterator save{iter}; - --iter; - return Iterator{save}; - } - - /// Iterator equality. - bool operator==(const Iterator& other) - { - return iter == other.iter; - } - - /// Iterator inequality. - bool operator!=(const Iterator& other) - { - return iter != other.iter; - } - - /** - * Dereference operator. 
Returns the underlying value_type, - * which will always be a std::pair - * @return a reference to the value of the object that is dereferenced - */ - const typename InnerIterator::value_type& operator*() - { - return *iter; - } - - /** - * Arrow operator. Returns a pointer to the underlying - * value_type, which will always be a std::pair - * @return a pointer to the value of the object that is dereferenced - */ - const typename InnerIterator::value_type* operator->() - { - return &(*iter); - } - }; - - /** - * Easier typename to deal with if capital, also lets const_iterator - * share same name - */ - typedef Iterator iterator; - - /// Lets const_iterator be interchangeable with "iterator" - typedef Iterator const_iterator; - - Iterator find(const Key& key) const; - - private: - /// The internal map representing Key -> Value pairs - std::vector> table_; - - /// Hash function for this hash table - std::hash hash_; - - public: - /** - * @return an iterator to the beginning of this container - */ - const_iterator begin() const - { - auto it = table_.begin(); - while (it->first == uint64_t{0} && it != table_.end()) - ++it; - return it; - } - - /** - * @return an iterator to the end of this container - */ - const_iterator end() const - { - return table_.end(); - } - - /** - * Basic exception for static_probe_map interactions. - */ - class static_probe_map_exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; -}; -} -} - -#include "util/static_probe_map.tcc" -#endif diff --git a/include/util/static_probe_map.tcc b/include/util/static_probe_map.tcc deleted file mode 100644 index 3c2d6d9a1..000000000 --- a/include/util/static_probe_map.tcc +++ /dev/null @@ -1,82 +0,0 @@ -/** - * @file static_probe_map.tcc - * @author Sean Massung - */ - -namespace meta -{ -namespace util -{ -template -static_probe_map::static_probe_map(uint64_t num_elems) - : table_(num_elems / 0.7) // load factor of 0.7 -{ -} - -template -Value& static_probe_map::operator[](const Key& key) -{ - auto hashed = hash_(key); - auto idx = hashed % table_.size(); - while (true) - { - if (table_[idx].first == uint64_t{0}) - { - table_[idx].first = hashed; - return table_[idx].second; - } - - if (table_[idx].first == hashed) - return table_[idx].second; - - idx = (idx + 1) % table_.size(); - } -} - -template -Value& static_probe_map::at(const Key& key) -{ - return (*this)[key]; -} - -template -const Value& static_probe_map::at(const Key& key) const -{ - return (*this)[key]; -} - -template -const Value& static_probe_map::operator[](const Key& key) const -{ - auto hashed = hash_(key); - auto idx = hashed % table_.size(); - while (true) - { - if (table_[idx].first == uint64_t{0}) - throw static_probe_map_exception{"key does not exist"}; - - if (table_[idx].first == hashed) - return table_[idx].second; - - idx = (idx + 1) % table_.size(); - } -} - -template -auto static_probe_map::find(const Key& key) const -> Iterator -{ - auto hashed = hash_(key); - auto idx = hashed % table_.size(); - while (true) - { - if (table_[idx].first == uint64_t{0}) - return end(); - - if (table_[idx].first == hashed) - return {table_.begin() + idx}; - - idx = (idx + 1) % table_.size(); - } -} -} -} diff --git a/src/analyzers/diff_analyzer.cpp b/src/analyzers/diff_analyzer.cpp index 0e41f05f1..c1505d99a 100644 --- a/src/analyzers/diff_analyzer.cpp +++ b/src/analyzers/diff_analyzer.cpp @@ -19,7 +19,7 @@ const std::string diff_analyzer::id = "diff"; diff_analyzer::diff_analyzer(const cpptoml::table& config, 
std::unique_ptr stream) - : stream_{std::move(stream)}, diff_{config} + : stream_{std::move(stream)}, diff_{std::make_shared(config)} { // nothing } @@ -55,7 +55,7 @@ void diff_analyzer::tokenize(corpus::document& doc) try { lm::sentence sent{s}; - auto candidates = diff_.candidates(sent, true); + auto candidates = diff_->candidates(sent, true); auto edits = candidates[0].first.operations(); if (edits.empty()) doc.increment("unmodified", 1); diff --git a/src/lm/CMakeLists.txt b/src/lm/CMakeLists.txt index fdc94d928..48473b38e 100644 --- a/src/lm/CMakeLists.txt +++ b/src/lm/CMakeLists.txt @@ -4,6 +4,7 @@ add_subdirectory(tools) add_library(meta-language-model language_model.cpp diff.cpp + static_probe_map.cpp sentence.cpp) target_link_libraries(meta-language-model meta-corpus meta-analyzers diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index 426721477..8fbe9088f 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -13,7 +13,6 @@ #include "util/shim.h" #include "lm/language_model.h" #include "logging/logger.h" -#include "util/static_probe_map.h" namespace meta { @@ -23,13 +22,28 @@ namespace lm language_model::language_model(const cpptoml::table& config) { auto table = config.get_table("language-model"); - auto arpa_file = table->get_as("arpa-file"); - LOG(info) << "Loading language model from .arpa file... " << ENDLG; - auto time = common::time([&]() - { - read_arpa_format(*arpa_file); - }); - LOG(info) << "Done. (" << time.count() << "ms)" << ENDLG; + N_ = 0; + if (filesystem::file_exists("0.binlm")) + { + LOG(info) << "Loading language model from binary file..." << ENDLG; + auto time = common::time( + [&]() + { + while (filesystem::file_exists(std::to_string(N_) + ".binlm")) + lm_.emplace_back(std::to_string(N_++) + ".binlm"); + }); + LOG(info) << "Done. (" << time.count() << "ms)" << ENDLG; + } + else + { + auto arpa_file = table->get_as("arpa-file"); + LOG(info) << "Loading language model from .arpa file... " << ENDLG; + auto time = common::time([&]() + { + read_arpa_format(*arpa_file); + }); + LOG(info) << "Done. 
(" << time.count() << "ms)" << ENDLG; + } } void language_model::read_arpa_format(const std::string& arpa_file) @@ -51,8 +65,7 @@ void language_model::read_arpa_format(const std::string& arpa_file) break; } - N_ = 0; - lm_.emplace_back(count[N_]); // add current n-value data + lm_.emplace_back(std::to_string(N_) + ".binlm", count[N_]); while (std::getline(infile, buffer)) { // if blank or end @@ -62,7 +75,8 @@ void language_model::read_arpa_format(const std::string& arpa_file) // if start of new ngram data if (buffer[0] == '\\') { - lm_.emplace_back(count[++N_]); // add current n-value data + ++N_; + lm_.emplace_back(std::to_string(N_) + ".binlm", count[N_]); continue; } @@ -73,8 +87,10 @@ void language_model::read_arpa_format(const std::string& arpa_file) float backoff = 0.0; if (second_tab != std::string::npos) backoff = std::stof(buffer.substr(second_tab + 1)); - lm_[N_][ngram] = {prob, backoff}; + lm_[N_].insert(ngram, prob, backoff); } + + ++N_; } std::vector> @@ -116,30 +132,30 @@ float language_model::prob_calc(sentence tokens) const if (tokens.size() == 1) { - auto it = lm_[0].find(tokens[0]); - if (it != lm_[0].end()) - return it->second.prob; - return lm_[0].at("").prob; + auto opt = lm_[0].find(tokens[0]); + if (opt) + return opt->prob; + return lm_[0].find("")->prob; } else { - auto it = lm_[tokens.size() - 1].find(tokens.to_string()); - if (it != lm_[tokens.size() - 1].end()) - return it->second.prob; + auto opt = lm_[tokens.size() - 1].find(tokens.to_string()); + if (opt) + return opt->prob; auto hist = tokens(0, tokens.size() - 1); tokens.pop_front(); if (tokens.size() == 1) { hist = hist(0, 1); - auto it = lm_[0].find(hist[0]); - if (it == lm_[0].end()) + auto opt = lm_[0].find(hist[0]); + if (!opt) hist.substitute(0, ""); } - it = lm_[hist.size() - 1].find(hist.to_string()); - if (it != lm_[hist.size() - 1].end()) - return it->second.backoff + prob_calc(tokens); + opt = lm_[hist.size() - 1].find(hist.to_string()); + if (opt) + return opt->backoff + prob_calc(tokens); return prob_calc(tokens); } } diff --git a/src/lm/static_probe_map.cpp b/src/lm/static_probe_map.cpp new file mode 100644 index 000000000..9716747de --- /dev/null +++ b/src/lm/static_probe_map.cpp @@ -0,0 +1,61 @@ +/** + * @file static_probe_map.cpp + * @author Sean Massung + */ + +#include "lm/static_probe_map.h" + +namespace meta +{ +namespace lm +{ +static_probe_map::static_probe_map(const std::string& filename, + uint64_t num_elems) + : table_{filename, static_cast((num_elems / 0.7) * 2)} +// load factor of 0.7; x2 for keys and vals +{ +} + +void static_probe_map::insert(const std::string& key, float prob, float backoff) +{ + auto hashed = hash_(key); + auto idx = (hashed % (table_.size() / 2)) * 2; + + while (true) + { + if (table_[idx] == uint64_t{0}) + { + table_[idx] = hashed; + // pack prob and float into uint64_t slot next to key val + uint64_t& ref = table_[idx + 1]; + *reinterpret_cast(&ref) = prob; + *(reinterpret_cast(&ref) + 1) = backoff; + return; + } + + if (table_[idx] == hashed) + throw static_probe_map_exception{ + "key already exists (or collision)"}; + + idx = (idx + 2) % table_.size(); + } +} + +util::optional static_probe_map::find(const std::string& key) const +{ + auto hashed = hash_(key); + auto idx = (hashed % (table_.size() / 2)) * 2; + + while (true) + { + if (table_[idx] == uint64_t{0}) + return util::nullopt; + + if (table_[idx] == hashed) + return util::optional{lm_node{table_[idx + 1]}}; + + idx = (idx + 2) % table_.size(); + } +} +} +} From 
cc71a43e57df4f526bdb9d6e6e1f55c76abb5f31 Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Sun, 2 Aug 2015 21:07:42 -0500
Subject: [PATCH 163/481] revert incorrect changes from 73c32ee81879f, which
 used (n-1)-gram probs

---
 src/test/lm_test.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/test/lm_test.cpp b/src/test/lm_test.cpp
index b61989b79..75116a3ec 100644
--- a/src/test/lm_test.cpp
+++ b/src/test/lm_test.cpp
@@ -30,8 +30,8 @@ int lm_tests()
         lm::sentence s3{"<s> Hello world ! </s>", false};
         lm::sentence s4{"<s> xyz xyz xyz </s>", false};
 
-        ASSERT_APPROX_EQUAL(model.log_prob(s1), -13.58225155);
-        ASSERT_APPROX_EQUAL(model.log_prob(s2), -16.32878304);
+        ASSERT_APPROX_EQUAL(model.log_prob(s1), -5.0682507);
+        ASSERT_APPROX_EQUAL(model.log_prob(s2), -11.7275571);
         ASSERT_APPROX_EQUAL(model.log_prob(s3), -11.07649517);
         ASSERT_APPROX_EQUAL(model.log_prob(s4), -16.41804123);
     });

From 4e0790643721484a66029378b10d291b66d9595e Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Sun, 2 Aug 2015 21:20:52 -0500
Subject: [PATCH 164/481] use string hash function that takes all chars into
 account

---
 include/lm/static_probe_map.h |  8 +++++---
 src/lm/static_probe_map.cpp   | 12 ++++++++++--
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/include/lm/static_probe_map.h b/include/lm/static_probe_map.h
index 77f0b39de..b01c2a4af 100644
--- a/include/lm/static_probe_map.h
+++ b/include/lm/static_probe_map.h
@@ -59,12 +59,14 @@ class static_probe_map
     void insert(const std::string& key, float prob, float backoff);
 
     private:
+    /**
+     * Mersenne prime string hash
+     */
+    uint64_t hash(const std::string& str) const;
+
     /// The internal map representing std::string -> lm_node pairs
     util::disk_vector<uint64_t> table_;
 
-    /// Convert strings to uint64_ts
-    std::hash<std::string> hash_;
-
     public:
    /**
     * Basic exception for static_probe_map interactions.
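
Aside (illustration only): this fix matters because the probe map stores just
the 64-bit hash of each n-gram, never the string itself, so any collision
silently merges two distinct n-grams. With a well-mixed 64-bit hash the
expected number of colliding pairs among n keys is about n(n-1)/2^65, roughly
3 in 10,000 for 10^8 n-grams, but a hash that ignores some characters collides
far more often. The stride-2 slot layout probed by the .cpp changes below can
be modeled by the following toy table (hypothetical code, not the MeTA
implementation): even slots hold key hashes, odd slots hold packed values, and
a zero key slot means empty.

    #include <cstdint>
    #include <vector>

    struct toy_probe_map
    {
        // layout: [hash0, value0, hash1, value1, ...]
        std::vector<uint64_t> slots;

        explicit toy_probe_map(uint64_t capacity) : slots(2 * capacity, 0) {}

        // assumes the table never fills (the real map sizes itself for a 0.7
        // load factor) and that no real key hashes to the zero sentinel
        void insert(uint64_t hash, uint64_t value)
        {
            auto idx = (hash % (slots.size() / 2)) * 2;
            while (slots[idx] != 0 && slots[idx] != hash)
                idx = (idx + 2) % slots.size(); // linear probing, stride 2
            slots[idx] = hash;
            slots[idx + 1] = value;
        }

        // returns nullptr when the key was never inserted
        const uint64_t* find(uint64_t hash) const
        {
            auto idx = (hash % (slots.size() / 2)) * 2;
            while (slots[idx] != 0)
            {
                if (slots[idx] == hash)
                    return &slots[idx + 1];
                idx = (idx + 2) % slots.size();
            }
            return nullptr;
        }
    };
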
diff --git a/src/lm/static_probe_map.cpp b/src/lm/static_probe_map.cpp index 9716747de..5a4bb3a67 100644 --- a/src/lm/static_probe_map.cpp +++ b/src/lm/static_probe_map.cpp @@ -18,7 +18,7 @@ static_probe_map::static_probe_map(const std::string& filename, void static_probe_map::insert(const std::string& key, float prob, float backoff) { - auto hashed = hash_(key); + auto hashed = hash(key); auto idx = (hashed % (table_.size() / 2)) * 2; while (true) @@ -43,7 +43,7 @@ void static_probe_map::insert(const std::string& key, float prob, float backoff) util::optional static_probe_map::find(const std::string& key) const { - auto hashed = hash_(key); + auto hashed = hash(key); auto idx = (hashed % (table_.size() / 2)) * 2; while (true) @@ -57,5 +57,13 @@ util::optional static_probe_map::find(const std::string& key) const idx = (idx + 2) % table_.size(); } } + +uint64_t static_probe_map::hash(const std::string& str) const +{ + uint64_t result = 2166136261; + for (auto& ch : str) + result = 127 * result + static_cast(ch); + return result; +} } } From aad7c28c55c72f3acda2f6251bb593bbe17df6ae Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 3 Aug 2015 11:54:54 -0500 Subject: [PATCH 165/481] replace reinterpret_cast with std::memcpy --- include/lm/lm_node.h | 6 ++++-- src/lm/static_probe_map.cpp | 9 ++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/lm/lm_node.h b/include/lm/lm_node.h index 44c57135f..311e7e788 100644 --- a/include/lm/lm_node.h +++ b/include/lm/lm_node.h @@ -11,6 +11,7 @@ #define META_LM_NODE_H_ #include +#include namespace meta { @@ -32,8 +33,9 @@ struct lm_node lm_node(uint64_t packed) { - prob = *reinterpret_cast(&packed); - backoff = *(reinterpret_cast(&packed) + 1); + uint32_t buf = packed >> 32; + std::memcpy(&prob, &packed, sizeof(float)); + std::memcpy(&backoff, &buf, sizeof(float)); } bool operator==(const lm_node& other) const diff --git a/src/lm/static_probe_map.cpp b/src/lm/static_probe_map.cpp index 5a4bb3a67..8a43d7884 100644 --- a/src/lm/static_probe_map.cpp +++ b/src/lm/static_probe_map.cpp @@ -3,6 +3,7 @@ * @author Sean Massung */ +#include #include "lm/static_probe_map.h" namespace meta @@ -26,10 +27,12 @@ void static_probe_map::insert(const std::string& key, float prob, float backoff) if (table_[idx] == uint64_t{0}) { table_[idx] = hashed; + // pack prob and float into uint64_t slot next to key val - uint64_t& ref = table_[idx + 1]; - *reinterpret_cast(&ref) = prob; - *(reinterpret_cast(&ref) + 1) = backoff; + uint64_t buf = 0; + std::memcpy(&table_[idx + 1], &prob, sizeof(float)); + std::memcpy(&buf, &backoff, sizeof(float)); + table_[idx + 1] |= (buf << 32); return; } From 3bae802b3dcd9262c1dc4a0dcc3d9bb64f93fa36 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 3 Aug 2015 12:39:41 -0500 Subject: [PATCH 166/481] language_model support for binary file prefix config option --- config.toml | 1 + include/lm/language_model.h | 17 ++++++++++++++--- src/lm/language_model.cpp | 24 ++++++++++++++++-------- src/test/inverted_index_test.cpp | 4 +++- 4 files changed, 34 insertions(+), 12 deletions(-) diff --git a/config.toml b/config.toml index 0eb6c647f..1ef5d67bd 100644 --- a/config.toml +++ b/config.toml @@ -84,3 +84,4 @@ test-sections = [23, 23] [language-model] arpa-file = "../data/english-sentences.arpa" +binary-file-prefix = "english-sentences-" diff --git a/include/lm/language_model.h b/include/lm/language_model.h index fa1014826..94635515e 100644 --- a/include/lm/language_model.h +++ b/include/lm/language_model.h @@ -34,10 
+34,15 @@ namespace lm * Required config parameters: * ~~~toml * [language-model] - * arpa-file = "path-to-file" + * binary-file-prefix = "path-to-binary-files" + * # only this key is needed if the LM is already binarized * ~~~ * - * Optional config parameters: none. + * Optional config parameters: + * ~~~toml + * [language-model] + * arpa-file = "path-to-arpa-file" # if no binary files have yet been created + * ~~~ */ class language_model { @@ -48,6 +53,9 @@ class language_model */ language_model(const cpptoml::table& config); + /** + * Default move constructor. + */ language_model(language_model&&) = default; /** @@ -83,8 +91,11 @@ class language_model /** * Reads precomputed LM data into this object. * @param arpa_file The path to the ARPA-formatted file + * @param binary_prefix Prefix to store the precomputed language model + * mappings derived from the .arpa file */ - void read_arpa_format(const std::string& arpa_file); + void read_arpa_format(const std::string& arpa_file, + const std::string& binary_prefix); /** * @param tokens diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index 8fbe9088f..f8e843dea 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -22,31 +22,38 @@ namespace lm language_model::language_model(const cpptoml::table& config) { auto table = config.get_table("language-model"); + auto arpa_file = table->get_as("arpa-file"); + auto binary_file = table->get_as("binary-file-prefix"); + N_ = 0; - if (filesystem::file_exists("0.binlm")) + if (binary_file && filesystem::file_exists(*binary_file + "0.binlm")) { LOG(info) << "Loading language model from binary file..." << ENDLG; auto time = common::time( [&]() { - while (filesystem::file_exists(std::to_string(N_) + ".binlm")) + while (filesystem::file_exists(*binary_file + std::to_string(N_) + + ".binlm")) lm_.emplace_back(std::to_string(N_++) + ".binlm"); }); LOG(info) << "Done. (" << time.count() << "ms)" << ENDLG; } - else + else if (arpa_file && binary_file) { - auto arpa_file = table->get_as("arpa-file"); LOG(info) << "Loading language model from .arpa file... " << ENDLG; auto time = common::time([&]() { - read_arpa_format(*arpa_file); + read_arpa_format(*arpa_file, *binary_file); }); LOG(info) << "Done. 
(" << time.count() << "ms)" << ENDLG; } + else + throw language_model_exception{ + "arpa-file or binary-file-prefix needed in config file"}; } -void language_model::read_arpa_format(const std::string& arpa_file) +void language_model::read_arpa_format(const std::string& arpa_file, + const std::string& binary_prefix) { std::ifstream infile{arpa_file}; std::string buffer; @@ -65,7 +72,7 @@ void language_model::read_arpa_format(const std::string& arpa_file) break; } - lm_.emplace_back(std::to_string(N_) + ".binlm", count[N_]); + lm_.emplace_back(binary_prefix + std::to_string(N_) + ".binlm", count[N_]); while (std::getline(infile, buffer)) { // if blank or end @@ -76,7 +83,8 @@ void language_model::read_arpa_format(const std::string& arpa_file) if (buffer[0] == '\\') { ++N_; - lm_.emplace_back(std::to_string(N_) + ".binlm", count[N_]); + lm_.emplace_back(binary_prefix + std::to_string(N_) + ".binlm", + count[N_]); continue; } diff --git a/src/test/inverted_index_test.cpp b/src/test/inverted_index_test.cpp index a711c64e9..28f7476d1 100644 --- a/src/test/inverted_index_test.cpp +++ b/src/test/inverted_index_test.cpp @@ -53,7 +53,9 @@ void create_config(const std::string& corpus_type) << "method = \"ngram-word\"\n" << "ngram = 1\n" << "filter = \"default-chain\"\n" - << "[language-model]\narpa-file = \"../data/english-sentences.arpa\""; + << "[language-model]\n" + << "arpa-file = \"../data/english-sentences.arpa\"\n" + << "binary-file-prefix = \"test-lm-\""; } template From 2fffe030fc695096b08bd788a61fe533b75255c2 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 3 Aug 2015 15:44:35 -0500 Subject: [PATCH 167/481] add (very slow) top_k functionality back into language_model --- include/lm/language_model.h | 11 ++++++---- src/lm/diff.cpp | 4 +--- src/lm/language_model.cpp | 44 +++++++++++++++++++++++++++---------- src/lm/sentence.cpp | 3 --- src/lm/tools/lm-test.cpp | 4 ++-- 5 files changed, 42 insertions(+), 24 deletions(-) diff --git a/include/lm/language_model.h b/include/lm/language_model.h index 94635515e..c532beafe 100644 --- a/include/lm/language_model.h +++ b/include/lm/language_model.h @@ -91,11 +91,8 @@ class language_model /** * Reads precomputed LM data into this object. 
* @param arpa_file The path to the ARPA-formatted file - * @param binary_prefix Prefix to store the precomputed language model - * mappings derived from the .arpa file */ - void read_arpa_format(const std::string& arpa_file, - const std::string& binary_prefix); + void read_arpa_format(const std::string& arpa_file); /** * @param tokens @@ -103,9 +100,15 @@ class language_model */ float prob_calc(sentence tokens) const; + void load_vocab(); + uint64_t N_; /// The "n" value for this n-gram language model std::vector lm_; + + std::vector vocabulary_; + + std::string prefix_; }; class language_model_exception : public std::runtime_error diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index 45c248727..7bdebaa40 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -3,8 +3,6 @@ * @author Sean Massung */ -#include - #include #include #include "lm/diff.h" @@ -111,7 +109,7 @@ void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth) best.pop_back(); try { - for (auto& next : lm_.top_k(best, 5)) + for (auto& next : lm_.top_k(best, 3)) { if (next.first == "") continue; diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index f8e843dea..fb03b3e5c 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -34,26 +34,30 @@ language_model::language_model(const cpptoml::table& config) { while (filesystem::file_exists(*binary_file + std::to_string(N_) + ".binlm")) - lm_.emplace_back(std::to_string(N_++) + ".binlm"); + lm_.emplace_back(*binary_file + std::to_string(N_++) + + ".binlm"); }); LOG(info) << "Done. (" << time.count() << "ms)" << ENDLG; + prefix_ = *binary_file; } else if (arpa_file && binary_file) { LOG(info) << "Loading language model from .arpa file... " << ENDLG; + prefix_ = *binary_file; auto time = common::time([&]() { - read_arpa_format(*arpa_file, *binary_file); + read_arpa_format(*arpa_file); }); LOG(info) << "Done. 
(" << time.count() << "ms)" << ENDLG;
     }
     else
         throw language_model_exception{
             "arpa-file or binary-file-prefix needed in config file"};
+
+    load_vocab();
 }
 
-void language_model::read_arpa_format(const std::string& arpa_file,
-                                      const std::string& binary_prefix)
+void language_model::read_arpa_format(const std::string& arpa_file)
 {
     std::ifstream infile{arpa_file};
     std::string buffer;
@@ -72,7 +76,8 @@ void language_model::read_arpa_format(const std::string& arpa_file,
         break;
     }
 
-    lm_.emplace_back(binary_prefix + std::to_string(N_) + ".binlm", count[N_]);
+    lm_.emplace_back(prefix_ + std::to_string(N_) + ".binlm", count[N_]);
+    std::ofstream unigrams{prefix_ + "0.strings"};
     while (std::getline(infile, buffer))
     {
         // if blank or end
@@ -83,7 +88,7 @@ void language_model::read_arpa_format(const std::string& arpa_file,
         if (buffer[0] == '\\')
         {
             ++N_;
-            lm_.emplace_back(binary_prefix + std::to_string(N_) + ".binlm",
-                             count[N_]);
+            lm_.emplace_back(prefix_ + std::to_string(N_) + ".binlm",
+                             count[N_]);
             continue;
         }
@@ -96,6 +101,9 @@ void language_model::read_arpa_format(const std::string& arpa_file,
         if (second_tab != std::string::npos)
             backoff = std::stof(buffer.substr(second_tab + 1));
         lm_[N_].insert(ngram, prob, backoff);
+
+        if (N_ == 0)
+            unigrams << ngram << std::endl;
     }
 
     ++N_;
 }
 
@@ -111,13 +119,13 @@ std::vector<std::pair<std::string, float>>
         return a.second > b.second;
     };
 
     std::vector<std::pair<std::string, float>> candidates;
-    /*
+
     sentence candidate = prev;
     candidate.push_back("word"); // the last item is replaced each iteration
-    for (auto& word : lm_[0])
+    for (auto& word : vocabulary_)
     {
-        auto candidate = sentence{prev.to_string() + " " + word.first};
-        candidates.emplace_back(word.first, log_prob(candidate));
+        auto candidate = sentence{prev.to_string() + " " + word};
+        candidates.emplace_back(word, log_prob(candidate));
         std::push_heap(candidates.begin(), candidates.end(), comp);
         if (candidates.size() > k)
         {
@@ -128,11 +136,23 @@ std::vector<std::pair<std::string, float>>
     for (auto end = candidates.end(); end != candidates.begin(); --end)
         std::pop_heap(candidates.begin(), end, comp);
 
-    */
     return candidates;
 }
 
+void language_model::load_vocab()
+{
+    std::string word;
+    std::ifstream unigrams{prefix_ + "0.strings"};
+    while (std::getline(unigrams, word))
+    {
+        if (word.empty())
+            continue;
+
+        vocabulary_.push_back(word);
+    }
+}
+
 float language_model::prob_calc(sentence tokens) const
 {
     if (tokens.size() == 0)
@@ -174,7 +194,7 @@ float language_model::log_prob(sentence tokens) const
 
     // tokens < N
     sentence ngram;
-    for (uint64_t i = 0; i < N_ - 1; ++i)
+    for (uint64_t i = 0; i < N_ - 1 && i < tokens.size(); ++i)
     {
         ngram.push_back(tokens[i]);
         prob += prob_calc(ngram);
diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp
index 71cee7a95..886f43260 100644
--- a/src/lm/sentence.cpp
+++ b/src/lm/sentence.cpp
@@ -23,10 +23,7 @@ sentence::sentence(const std::string& text, bool tokenize /* = true */)
 {
     using namespace analyzers;
     std::unique_ptr stream;
-    stream = make_unique();
     stream = make_unique();
-    stream = make_unique(std::move(stream));
-    stream = make_unique(std::move(stream));
     stream = make_unique(std::move(stream));
     std::string text_copy{text}; // consider changing parameter to non-const
     stream->set_content(std::move(text_copy));
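
Aside (illustration only): prob_calc above implements the standard ARPA
backoff recursion in log10 space: return the stored probability when the full
n-gram was seen; otherwise add the history's backoff weight (log10(1) = 0 when
the history itself is unseen) and rescore with the oldest context word
dropped, bottoming out at the <unk> unigram. A simplified self-contained
sketch of that recursion, with hypothetical names and none of the library's
sentence machinery:

    #include <string>
    #include <unordered_map>
    #include <vector>

    // (log10 prob, log10 backoff) pairs, as stored in an ARPA file
    struct entry
    {
        float prob;
        float backoff;
    };

    // tables[k] maps a space-joined (k+1)-gram to its entry
    using lm_tables = std::vector<std::unordered_map<std::string, entry>>;

    std::string join(const std::vector<std::string>& toks, std::size_t begin,
                     std::size_t end)
    {
        std::string out;
        for (auto i = begin; i < end; ++i)
            out += (i == begin ? "" : " ") + toks[i];
        return out;
    }

    float score(const lm_tables& lm, std::vector<std::string> toks)
    {
        auto n = toks.size();
        auto it = lm[n - 1].find(join(toks, 0, n));
        if (it != lm[n - 1].end())
            return it->second.prob;

        if (n == 1)
            return lm[0].at("<unk>").prob; // out-of-vocabulary fallback

        float backoff = 0.0f;
        auto hist = lm[n - 2].find(join(toks, 0, n - 1));
        if (hist != lm[n - 2].end())
            backoff = hist->second.backoff;

        toks.erase(toks.begin()); // drop the oldest context word
        return backoff + score(lm, toks);
    }

diff --git a/src/lm/tools/lm-test.cpp b/src/lm/tools/lm-test.cpp
index 42763f41f..228d38c42 100644
--- a/src/lm/tools/lm-test.cpp
+++ b/src/lm/tools/lm-test.cpp
@@ -26,6 +26,7 @@ int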
main(int argc, char* argv[]) lm::sentence sent{line}; std::cout << model.log_prob(sent) << std::endl; } + */ - /* lm::diff correcter{cpptoml::parse_file(argv[1])}; std::ifstream in{argv[2]}; auto num_sentences = filesystem::num_lines(argv[2]); @@ -80,5 +81,4 @@ int main(int argc, char* argv[]) prog.end(); std::cout << "Percent no-ops: " << do_nothing / done << std::endl; - */ } From 7e3ba57888cefbaa1c849057f444f31c83a50c18 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 3 Aug 2015 15:50:30 -0500 Subject: [PATCH 168/481] test creating and reading from binary LM files --- src/test/lm_test.cpp | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/src/test/lm_test.cpp b/src/test/lm_test.cpp index 75116a3ec..7b27c03d8 100644 --- a/src/test/lm_test.cpp +++ b/src/test/lm_test.cpp @@ -17,24 +17,32 @@ int lm_tests() int num_failed = 0; create_config("line"); - num_failed += testing::run_test( - "lm-test", [&]() - { - lm::language_model model{cpptoml::parse_file("test-config.toml")}; - lm::sentence s1{ - " I disagree with this statement for several reasons . ", - false}; - lm::sentence s2{ - " I disagree with this octopus for several reasons . ", - false}; - lm::sentence s3{" Hello world ! ", false}; - lm::sentence s4{" xyz xyz xyz ", false}; + auto test = [&]() + { + lm::language_model model{cpptoml::parse_file("test-config.toml")}; + lm::sentence s1{ + " I disagree with this statement for several reasons . ", + false}; + lm::sentence s2{ + " I disagree with this octopus for several reasons . ", + false}; + lm::sentence s3{" Hello world ! ", false}; + lm::sentence s4{" xyz xyz xyz ", false}; - ASSERT_APPROX_EQUAL(model.log_prob(s1), -5.0682507); - ASSERT_APPROX_EQUAL(model.log_prob(s2), -11.7275571); - ASSERT_APPROX_EQUAL(model.log_prob(s3), -11.07649517); - ASSERT_APPROX_EQUAL(model.log_prob(s4), -16.41804123); - }); + ASSERT_APPROX_EQUAL(model.log_prob(s1), -5.0682507); + ASSERT_APPROX_EQUAL(model.log_prob(s2), -11.7275571); + ASSERT_APPROX_EQUAL(model.log_prob(s3), -11.07649517); + ASSERT_APPROX_EQUAL(model.log_prob(s4), -16.41804123); + }; + + // recreate binary LM files each test even if they already exist + filesystem::delete_file("test-lm-0.binlm"); + filesystem::delete_file("test-lm-1.binlm"); + filesystem::delete_file("test-lm-2.binlm"); + filesystem::delete_file("test-lm-0.strings"); + + num_failed += testing::run_test("lm-test", test); + num_failed += testing::run_test("lm-test-read-binary", test); return num_failed; } From 59213b767e071c448a557fe7b50b3121b3de44e3 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 7 Aug 2015 15:26:27 -0500 Subject: [PATCH 169/481] fix std::hash specialization using util::string_view --- include/util/string_view.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/util/string_view.h b/include/util/string_view.h index cf45a92ec..208059d54 100644 --- a/include/util/string_view.h +++ b/include/util/string_view.h @@ -609,8 +609,9 @@ struct hash> size_t operator()( const meta::util::basic_string_view& view) const noexcept { - static constexpr meta::util::murmur_hash<> hasher{}; - return hasher(view.data(), view.size()); + static meta::util::murmur_hash<> hasher{}; + return hasher(reinterpret_cast(view.data()), + view.size()); } }; } From 6e2168c16f00233009edef7354bbb774b0df60e7 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 7 Aug 2015 15:31:24 -0500 Subject: [PATCH 170/481] murmur_hash updates - accept seed in ctor - make operator() const - use detail 
namespace --- include/util/hash.h | 71 +++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/include/util/hash.h b/include/util/hash.h index b75a90046..e999ea429 100644 --- a/include/util/hash.h +++ b/include/util/hash.h @@ -11,6 +11,7 @@ #define META_UTIL_HASH_H_ #include +#include namespace meta { @@ -22,9 +23,9 @@ namespace util * will return a 32-bit or 64-bit hash value. */ template -struct murmur_hash; +class murmur_hash; -namespace +namespace detail { inline uint32_t rotl(uint32_t x, int8_t r) { @@ -63,18 +64,25 @@ inline uint64_t fmix(uint64_t h) * Murmur3Hash for 32-bit outputs. Based on MurmurHash3_x86_32. */ template <> -struct murmur_hash<4> +class murmur_hash<4> { - constexpr murmur_hash() = default; + public: + murmur_hash() : seed_{std::random_device{}()} + { + } + + murmur_hash(std::size_t seed) : seed_{seed} + { + } - std::size_t operator()(const uint8_t* data, int len, uint32_t seed) + std::size_t operator()(const uint8_t* data, int len) const { - std::size_t out = seed; + std::size_t out = seed_; const auto nblocks = len / 4; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; auto blocks = reinterpret_cast(data + nblocks * 4); @@ -83,11 +91,11 @@ struct murmur_hash<4> auto k1 = blocks[i]; k1 *= c1; - k1 = rotl(k1, 15); + k1 = detail::rotl(k1, 15); k1 *= c2; out ^= k1; - out = rotl(out, 13); + out = detail::rotl(out, 13); out = out * 5 + 0xe6546b64; } @@ -103,31 +111,41 @@ struct murmur_hash<4> case 1: k1 ^= tail[0]; k1 *= c1; - k1 = rotl(k1, 15); + k1 = detail::rotl(k1, 15); k1 *= c2; out ^= k1; } out ^= len; - return fmix(out); + return detail::fmix(out); } + + private: + std::size_t seed_; }; /** * MurmurHash3 for 64-bit outputs. Based on MurmurHash3_x64_128. */ template <> -struct murmur_hash<8> +class murmur_hash<8> { - constexpr murmur_hash() = default; + public: + murmur_hash() : seed_{std::random_device{}()} + { + } + + murmur_hash(uint64_t seed) : seed_{seed} + { + } - std::size_t operator()(const uint8_t* data, int len, uint64_t seed) + std::size_t operator()(const uint8_t* data, int len) const { const auto nblocks = len / 16; - auto h1 = seed; - auto h2 = seed; + auto h1 = seed_; + auto h2 = seed_; const uint64_t c1 = 0x87c37b91114253d5LLU; const uint64_t c2 = 0x4cf5ad432745937fLLU; @@ -140,20 +158,20 @@ struct murmur_hash<8> auto k2 = blocks[i * 2 + 1]; k1 *= c1; - k1 = rotl(k1, 31); + k1 = detail::rotl(k1, 31); k1 *= c2; h1 ^= k1; - h1 = rotl(h1, 27); + h1 = detail::rotl(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729; k2 *= c2; - k2 = rotl(k2, 33); + k2 = detail::rotl(k2, 33); k2 *= c1; h2 ^= k2; - h2 = rotl(h2, 31); + h2 = detail::rotl(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5; } @@ -180,7 +198,7 @@ struct murmur_hash<8> case 9: k2 ^= static_cast(tail[8]); k2 *= c2; - k2 = rotl(k2, 33); + k2 = detail::rotl(k2, 33); k2 *= c1; h2 ^= k2; @@ -201,7 +219,7 @@ struct murmur_hash<8> case 1: k1 ^= static_cast(tail[0]); k1 *= c1; - k1 = rotl(k1, 31); + k1 = detail::rotl(k1, 31); k1 *= c2; h1 ^= k1; } @@ -212,14 +230,17 @@ struct murmur_hash<8> h1 += h2; h2 += h1; - h1 = fmix(h1); - h2 = fmix(h2); + h1 = detail::fmix(h1); + h2 = detail::fmix(h2); h1 += h2; // h2 += h1, unneeded since we only want 64-bits. 
return h1; } + + private: + uint64_t seed_; }; } } From de3ddd07eaff67cb9e60cb43deb2e53cf53a1a4a Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 7 Aug 2015 15:32:48 -0500 Subject: [PATCH 171/481] static_probe_map uses util::murmur_hash --- include/lm/static_probe_map.h | 15 +++++++++++---- src/lm/static_probe_map.cpp | 8 +++----- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/include/lm/static_probe_map.h b/include/lm/static_probe_map.h index b01c2a4af..72b2ae432 100644 --- a/include/lm/static_probe_map.h +++ b/include/lm/static_probe_map.h @@ -14,6 +14,7 @@ #include "lm/lm_node.h" #include "util/disk_vector.h" #include "util/optional.h" +#include "util/hash.h" namespace meta { @@ -59,14 +60,20 @@ class static_probe_map void insert(const std::string& key, float prob, float backoff); private: - /** - * Mersenne prime string hash - */ - uint64_t hash(const std::string& str) const; + /// A seed for the string hash function + static constexpr uint64_t seed_ = 0x2bedf99b3aa222d9; /// The internal map representing std::string -> lm_node pairs util::disk_vector table_; + /// 64-bit hash function for strings + util::murmur_hash<> hash_; + + /** + * Helper function to hash a string with util::murmur_hash + */ + uint64_t hash(const std::string& str) const; + public: /** * Basic exception for static_probe_map interactions. diff --git a/src/lm/static_probe_map.cpp b/src/lm/static_probe_map.cpp index 8a43d7884..4aef8400b 100644 --- a/src/lm/static_probe_map.cpp +++ b/src/lm/static_probe_map.cpp @@ -12,7 +12,8 @@ namespace lm { static_probe_map::static_probe_map(const std::string& filename, uint64_t num_elems) - : table_{filename, static_cast((num_elems / 0.7) * 2)} + : table_{filename, static_cast((num_elems / 0.7) * 2)}, + hash_{seed_} // load factor of 0.7; x2 for keys and vals { } @@ -63,10 +64,7 @@ util::optional static_probe_map::find(const std::string& key) const uint64_t static_probe_map::hash(const std::string& str) const { - uint64_t result = 2166136261; - for (auto& ch : str) - result = 127 * result + static_cast(ch); - return result; + return hash_(reinterpret_cast(str.c_str()), str.size()); } } } From e3e5ad45aa36c17f8fdd2f9528f9c594dd80f6a4 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 7 Aug 2015 16:10:45 -0500 Subject: [PATCH 172/481] comment LM-related classes; simplify ret value in static_probe_map::find --- include/lm/language_model.h | 3 +++ include/lm/static_probe_map.h | 3 +++ src/lm/static_probe_map.cpp | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/lm/language_model.h b/include/lm/language_model.h index c532beafe..f2bf43455 100644 --- a/include/lm/language_model.h +++ b/include/lm/language_model.h @@ -100,6 +100,9 @@ class language_model */ float prob_calc(sentence tokens) const; + /** + * Loads unigram vocabulary from text file to allow top_k to work. + */ void load_vocab(); uint64_t N_; /// The "n" value for this n-gram language model diff --git a/include/lm/static_probe_map.h b/include/lm/static_probe_map.h index 72b2ae432..2110f2380 100644 --- a/include/lm/static_probe_map.h +++ b/include/lm/static_probe_map.h @@ -42,6 +42,9 @@ class static_probe_map */ static_probe_map(const std::string& filename, uint64_t num_elems = 0); + /** + * Default move constructor. 
+ */
     static_probe_map(static_probe_map&&) = default;
 
     /**
diff --git a/src/lm/static_probe_map.cpp b/src/lm/static_probe_map.cpp
index 4aef8400b..222e8d1a0 100644
--- a/src/lm/static_probe_map.cpp
+++ b/src/lm/static_probe_map.cpp
@@ -56,7 +56,7 @@ util::optional<lm_node> static_probe_map::find(const std::string& key) const
             return util::nullopt;
 
         if (table_[idx] == hashed)
-            return util::optional<lm_node>{lm_node{table_[idx + 1]}};
+            return {table_[idx + 1]};
 
         idx = (idx + 2) % table_.size();
     }

From 16c9f3101596c5bc6f692e54450715d13d2eb6d8 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Mon, 10 Aug 2015 22:20:48 -0500
Subject: [PATCH 173/481] Add capacity measurement to util::sparse_vector.

---
 include/util/sparse_vector.h   | 5 +++++
 include/util/sparse_vector.tcc | 6 ++++++
 2 files changed, 11 insertions(+)

diff --git a/include/util/sparse_vector.h b/include/util/sparse_vector.h
index c4f9e5008..5b0796c06 100644
--- a/include/util/sparse_vector.h
+++ b/include/util/sparse_vector.h
@@ -120,6 +120,11 @@ class sparse_vector
      */
     uint64_t size() const;
 
+    /**
+     * @return the total capacity of the underlying storage
+     */
+    uint64_t capacity() const;
+
     /**
      * @return whether the vector is empty
      */
diff --git a/include/util/sparse_vector.tcc b/include/util/sparse_vector.tcc
index 330120ea0..9953edb11 100644
--- a/include/util/sparse_vector.tcc
+++ b/include/util/sparse_vector.tcc
@@ -127,6 +127,12 @@ uint64_t sparse_vector<Index, Value>::size() const
     return storage_.size();
 }
 
+template <class Index, class Value>
+uint64_t sparse_vector<Index, Value>::capacity() const
+{
+    return storage_.capacity();
+}
+
 template <class Index, class Value>
 bool sparse_vector<Index, Value>::empty() const
 {

From fcb7a3609486131f5064b025e2e1368d3b4b9e75 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Mon, 10 Aug 2015 22:21:16 -0500
Subject: [PATCH 174/481] Default ctors and op= for postings_data.

---
 include/index/postings_data.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/include/index/postings_data.h b/include/index/postings_data.h
index 268100956..2b514c4b6 100644
--- a/include/index/postings_data.h
+++ b/include/index/postings_data.h
@@ -71,6 +71,26 @@ class postings_data
      */
     postings_data(PrimaryKey p_id);
 
+    /**
+     * Postings data is copy constructable.
+     */
+    postings_data(const postings_data&) = default;
+
+    /**
+     * Postings data is move constructable.
+     */
+    postings_data(postings_data&&) = default;
+
+    /**
+     * Postings data is copy assignable.
+     */
+    postings_data& operator=(const postings_data&) = default;
+
+    /**
+     * Postings data is move assignable.
+     */
+    postings_data& operator=(postings_data&&) = default;
+
     /**
      * @param other The other postings_data object to consume
      * Adds the parameter's data to this object's data

From b68b071aaff7e6f125cd0916df69dd49eddc2c09 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Mon, 10 Aug 2015 22:24:06 -0500
Subject: [PATCH 175/481] Remove unnecessary calls to io::read/write_binary().

These should all pretty much always be io::packed::read/write().
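
Aside (illustration only): the diff below relies on io::packed::write/read
returning the number of bytes consumed. The reason packed I/O is preferred
over fixed-width io::write_binary for postings data is that doc ids, counts,
and lengths are mostly small integers, so a variable-length encoding spends
one byte where a raw uint64_t spends eight. MeTA's actual encoding is not
shown in these patches; a generic LEB128-style sketch of the idea, with
hypothetical helper names, looks like this:

    #include <cstdint>
    #include <istream>
    #include <ostream>

    // Write 7 bits per byte, least-significant group first; the high bit of
    // each byte flags that more bytes follow. Returns bytes written.
    uint64_t write_packed(std::ostream& out, uint64_t value)
    {
        uint64_t bytes = 1;
        while (value > 127)
        {
            out.put(static_cast<char>((value & 127) | 128));
            value >>= 7;
            ++bytes;
        }
        out.put(static_cast<char>(value));
        return bytes;
    }

    // Inverse of the above; returns bytes read (no error handling here).
    uint64_t read_packed(std::istream& in, uint64_t& value)
    {
        value = 0;
        uint64_t bytes = 0;
        uint8_t byte;
        do
        {
            byte = static_cast<uint8_t>(in.get());
            value |= static_cast<uint64_t>(byte & 127) << (7 * bytes++);
        } while (byte & 128);
        return bytes;
    }
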
--- include/index/postings_data.tcc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/index/postings_data.tcc b/include/index/postings_data.tcc index 41cf6be26..fd5056f69 100644 --- a/include/index/postings_data.tcc +++ b/include/index/postings_data.tcc @@ -7,7 +7,6 @@ #include #include #include "index/postings_data.h" -#include "io/binary.h" #include "io/packed.h" namespace meta @@ -125,7 +124,7 @@ uint64_t postings_data::write_packed( { uint64_t bytes = 0; - bytes += io::write_binary(out, p_id_); + bytes += io::packed::write(out, p_id_); bytes += write_packed_counts(out); return bytes; @@ -194,8 +193,7 @@ uint64_t postings_data::read_packed(std::istream& in) else in.unget(); - io::read_binary(in, p_id_); - auto bytes = length(p_id_); + auto bytes = io::packed::read(in, p_id_); uint64_t size; uint64_t total_counts; From 5b5cad87b8e1f0493a66619dbacdcf2d1c3753ae Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Mon, 10 Aug 2015 22:24:45 -0500 Subject: [PATCH 176/481] Use capacity for memory usage estimate for postings_data. --- include/index/postings_data.tcc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/index/postings_data.tcc b/include/index/postings_data.tcc index fd5056f69..b41404781 100644 --- a/include/index/postings_data.tcc +++ b/include/index/postings_data.tcc @@ -172,7 +172,7 @@ uint64_t length(const T& elem, typename std::enable_if::value>:: type* = nullptr) { - return elem.size(); + return elem.capacity(); } template @@ -233,7 +233,7 @@ uint64_t postings_data::read_packed(std::istream& in) template uint64_t postings_data::bytes_used() const { - return sizeof(pair_t) * counts_.size() + length(p_id_); + return sizeof(pair_t) * counts_.capacity() + length(p_id_); } } } From 824817b9a70d1f41082eda62ff7e8d423db330a1 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Mon, 10 Aug 2015 23:31:16 -0500 Subject: [PATCH 177/481] Use a multi-way merge sort for the final postings merge step. The way the indexing process is configured, there will never be more than std::thread::hardware_concurrency() number of chunks at the very end of the tokenization process. So instead of spawning another thread pool to do the merging and incurring a lot of IO, we can just open each of those segments and merge them all together at the same time. This saves us significant time on larger datasets (e.g. Wikipedia) while not harming the smaller datasets. --- include/index/chunk.h | 8 -- include/index/chunk.tcc | 77 ----------- include/index/chunk_handler.h | 8 +- include/index/chunk_handler.tcc | 219 ++++++++++++++++++++++++-------- include/index/postings_data.tcc | 6 +- 5 files changed, 173 insertions(+), 145 deletions(-) diff --git a/include/index/chunk.h b/include/index/chunk.h index db03e9b25..134db804b 100644 --- a/include/index/chunk.h +++ b/include/index/chunk.h @@ -50,14 +50,6 @@ class chunk */ std::string path() const; - /** - * After this function ends, the current chunk file will contain - * information from both chunks, and the "other" chunk file will be - * deleted. 
- * @param other The other chunk to merge merge_with - */ - void merge_with(const chunk& other); - /** * @param pdata A collection of postings data to combine with this chunk * pdata must: diff --git a/include/index/chunk.tcc b/include/index/chunk.tcc index 91e218452..dd4f37f25 100644 --- a/include/index/chunk.tcc +++ b/include/index/chunk.tcc @@ -45,80 +45,6 @@ uint64_t chunk::size() const return size_; } -template -void chunk::merge_with(const chunk& other) -{ - std::string temp_name = path_ + "_merge"; - - std::ifstream my_data{path_, std::ios::binary}; - std::ifstream other_data{other.path_, std::ios::binary}; - std::ofstream output{temp_name, std::ios::binary}; - - postings_data my_pd; - postings_data other_pd; - my_pd.read_packed(my_data); - other_pd.read_packed(other_data); - - uint64_t terms = 0; - // merge while both have postings data - while (my_data && other_data) - { - ++terms; - if (my_pd.primary_key() == other_pd.primary_key()) - { - // merge - my_pd.merge_with(other_pd); - // write - my_pd.write_packed(output); - - // read next two postings data - my_pd.read_packed(my_data); - other_pd.read_packed(other_data); - } - else if (my_pd.primary_key() < other_pd.primary_key()) - { - // write the winner - my_pd.write_packed(output); - // read next from the current chunk - my_pd.read_packed(my_data); - } - else - { - // write the winner - other_pd.write_packed(output); - // read next from the other chunk - other_pd.read_packed(other_data); - } - } - - // finish merging when one runs out - while (my_data) - { - ++terms; - my_pd.write_packed(output); - my_pd.read_packed(my_data); - } - while (other_data) - { - ++terms; - other_pd.write_packed(output); - other_pd.read_packed(other_data); - } - - my_data.close(); - other_data.close(); - output.close(); - filesystem::delete_file(path_); - filesystem::delete_file(path_ + ".numterms"); - filesystem::delete_file(other.path_); - filesystem::delete_file(other.path_ + ".numterms"); - filesystem::rename_file(temp_name, path_); - - std::ofstream termfile{path_ + ".numterms"}; - termfile << terms; - set_size(); -} - template template void chunk::memory_merge_with(Container& pdata) @@ -172,12 +98,9 @@ void chunk::memory_merge_with(Container& pdata) my_data.close(); output.close(); filesystem::delete_file(path_); - filesystem::delete_file(path_ + ".numterms"); filesystem::rename_file(temp_name, path_); pdata.clear(); - std::ofstream termfile{path_ + ".numterms"}; - termfile << terms; set_size(); } } diff --git a/include/index/chunk_handler.h b/include/index/chunk_handler.h index 13fed717f..d427b017f 100644 --- a/include/index/chunk_handler.h +++ b/include/index/chunk_handler.h @@ -79,8 +79,12 @@ class chunk_handler /// Current size of the in-memory chunk uint64_t chunk_size_; - /// Maximum allowed size of a chunk in bytes before it is written - const static uint64_t constexpr max_size = 1024 * 1024 * 128; // 128 MB + /** + * Maximum allowed size of a chunk in bytes before it is written. + * This is an *estimate*, so you should make sure there's some slop + * in this number to make sure you don't run out of memory. 
+ */ + const static uint64_t constexpr max_size = 1024 * 1024 * 256; // 256 MB /// Back-pointer to the handler this producer is operating on chunk_handler* parent_; diff --git a/include/index/chunk_handler.tcc b/include/index/chunk_handler.tcc index ec92f035e..03ebee8fe 100644 --- a/include/index/chunk_handler.tcc +++ b/include/index/chunk_handler.tcc @@ -16,9 +16,11 @@ namespace index template chunk_handler::producer::producer(chunk_handler* parent) - : chunk_size_{0}, parent_{parent} + : parent_{parent} { - // nothing + // sizeof(size_t): list size per bucket + // sizeof(void*): head pointer per bucket + chunk_size_ = pdata_.bucket_count() * (sizeof(size_t) + sizeof(void*)); } template @@ -33,18 +35,35 @@ void chunk_handler::producer::operator()(const secondary_key_type& key, auto it = pdata_.find(pd); if (it == pdata_.end()) { - chunk_size_ += pd.bytes_used(); + // sizeof(size_t): list size per bucket + // sizeof(void*): head pointer per bucket + chunk_size_ -= pdata_.bucket_count() + * (sizeof(size_t) + sizeof(void*)); + + // sizeof(void*): next pointer per element + chunk_size_ += pd.bytes_used() + sizeof(void*); + // 25% slop factor + chunk_size_ += (pd.bytes_used() + sizeof(void*)) / 4; + pdata_.emplace(pd); + + // sizeof(size_t): list size per bucket + // sizeof(void*): head pointer per bucket + chunk_size_ += pdata_.bucket_count() + * (sizeof(size_t) + sizeof(void*)); } else { - chunk_size_ -= it->bytes_used(); + chunk_size_ -= it->bytes_used() + sizeof(void*); + chunk_size_ -= (it->bytes_used() + sizeof(void*)) / 4; // note: we can modify elements in this set because we do not change // how comparisons are made (the primary_key value) - const_cast(*it).increase_count(key, - count.second); - chunk_size_ += it->bytes_used(); + const_cast(*it) + .increase_count(key, count.second); + + chunk_size_ += it->bytes_used() + sizeof(void*); + chunk_size_ += (it->bytes_used() + sizeof(void*)) / 4; } if (chunk_size_ >= max_size) @@ -55,16 +74,20 @@ void chunk_handler::producer::operator()(const secondary_key_type& key, template void chunk_handler::producer::flush_chunk() { - if (chunk_size_ == 0) + if (pdata_.empty()) return; std::vector pdata; for (auto it = pdata_.begin(); it != pdata_.end(); it = pdata_.erase(it)) pdata.emplace_back(std::move(*it)); + pdata_.clear(); std::sort(pdata.begin(), pdata.end()); parent_->write_chunk(pdata); - chunk_size_ = 0; + + // sizeof(size_t): list size per bucket + // sizeof(void*): head pointer per bucket + chunk_size_ = pdata_.bucket_count() * (sizeof(size_t) + sizeof(void*)); } template @@ -126,62 +149,146 @@ void chunk_handler::write_chunk(std::vector& pdata) } } +namespace detail +{ template -void chunk_handler::merge_chunks() +struct input_chunk { - size_t remaining = chunks_.size() - 1; - std::mutex mutex; - auto task = [&]() + std::unique_ptr file; + std::string path; + typename Index::index_pdata_type postings; + uint64_t total_bytes; + uint64_t bytes_read; + + input_chunk(const std::string& filename) + : file{make_unique(filename, std::ios::binary)}, + path{filename}, + total_bytes{filesystem::file_size(path)}, + bytes_read{0} { - while (true) + ++(*this); + } + + ~input_chunk() + { + if (file) + { + file = nullptr; + filesystem::delete_file(path); + } + } + + input_chunk(input_chunk&&) = default; + + input_chunk& operator=(input_chunk&& rhs) + { + if (file) { - util::optional first; - util::optional second; - { - std::lock_guard lock{mutables_}; - if (chunks_.size() < 2) - return; - first = chunks_.top(); - chunks_.pop(); - second = 
chunks_.top(); - chunks_.pop(); - LOG(progress) << "> Merging " << first->path() << " (" - << printing::bytes_to_units(first->size()) - << ") and " << second->path() << " (" - << printing::bytes_to_units(second->size()) - << "), " << --remaining << " remaining \r" - << ENDLG; - } - first->merge_with(*second); - { - std::lock_guard lock{mutex}; - chunks_.push(*first); - } + file = nullptr; + filesystem::delete_file(path); } - }; - parallel::thread_pool pool; - auto thread_ids = pool.thread_ids(); - std::vector> futures; - for (size_t i = 0; i < thread_ids.size(); ++i) - futures.emplace_back(pool.submit_task(task)); + file = std::move(rhs.file); + path = std::move(rhs.path); + postings = std::move(rhs.postings); + total_bytes = rhs.total_bytes; + bytes_read = rhs.bytes_read; + + return *this; + } + + operator bool() const + { + return static_cast(*file); + } - for (auto& fut : futures) - fut.get(); + bool operator<(const input_chunk& other) const + { + return postings < other.postings; + } - LOG(progress) << '\n' << ENDLG; + void operator++() + { + bytes_read += postings.read_packed(*file); + } +}; +} - if (chunks_.empty()) - throw chunk_handler_exception{"there were no chunks to merge"}; +template +void chunk_handler::merge_chunks() +{ + using input_chunk = detail::input_chunk; + std::vector to_merge; + to_merge.reserve(chunks_.size()); + while (!chunks_.empty()) + { + to_merge.emplace_back(chunks_.top().path()); + chunks_.pop(); + } + + printing::progress progress{ + " > Merging postings: ", + std::accumulate(to_merge.begin(), to_merge.end(), 0ul, + [](uint64_t acc, const input_chunk& chunk) + { + return acc + chunk.total_bytes; + })}; + std::ofstream outfile{prefix_ + "/postings.index", std::ios::binary}; + unique_primary_keys_ = 0; + + uint64_t total_read + = std::accumulate(to_merge.begin(), to_merge.end(), 0ul, + [](uint64_t acc, const input_chunk& chunk) + { + return acc + chunk.bytes_read; + }); + while (!to_merge.empty()) + { + progress(total_read); + ++(*unique_primary_keys_); - uint64_t unique_keys; - std::ifstream termfile{chunks_.top().path() + ".numterms"}; - termfile >> unique_keys; - termfile.close(); - filesystem::delete_file(chunks_.top().path() + ".numterms"); - filesystem::rename_file(chunks_.top().path(), prefix_ + "/postings.index"); + std::sort(to_merge.begin(), to_merge.end()); - unique_primary_keys_ = unique_keys; + // gather all postings that match the smallest primary key, reading + // a new postings from the corresponding file + auto range = std::equal_range(to_merge.begin(), to_merge.end(), + *to_merge.begin()); + auto min_pk = range.first->postings.primary_key(); + + using count_t = typename index_pdata_type::count_t; + std::vector to_write; + to_write.reserve(std::distance(range.first, range.second)); + std::for_each(range.first, range.second, [&](input_chunk& chunk) + { + to_write.emplace_back(chunk.postings.counts()); + auto before = chunk.bytes_read; + ++chunk; + total_read += (chunk.bytes_read - before); + }); + + // merge them all into one big counts vector + count_t counts; + std::for_each(to_write.begin(), to_write.end(), [&](count_t& pd) + { + std::move(pd.begin(), pd.end(), + std::back_inserter(counts)); + count_t{}.swap(pd); + }); + + // write out the merged counts + index_pdata_type output{std::move(min_pk)}; + output.set_counts(counts); + count_t{}.swap(counts); + output.write_packed(outfile); + + // remove all empty chunks from the input + to_merge.erase(std::remove_if(to_merge.begin(), to_merge.end(), + [](const input_chunk& chunk) + { + return 
!chunk; + }), + to_merge.end()); + } } template @@ -196,10 +303,10 @@ uint64_t chunk_handler::unique_primary_keys() const template uint64_t chunk_handler::final_size() const { - if (chunks_.size() != 1) + if (!chunks_.empty()) throw chunk_handler_exception{ "merge not complete before final_size() called"}; - return chunks_.top().size(); + return filesystem::file_size(prefix_ + "/postings.index"); } template diff --git a/include/index/postings_data.tcc b/include/index/postings_data.tcc index b41404781..526c09bac 100644 --- a/include/index/postings_data.tcc +++ b/include/index/postings_data.tcc @@ -132,7 +132,8 @@ uint64_t postings_data::write_packed( template template -uint64_t postings_data::write_packed_counts(std::ostream& out) const +uint64_t postings_data::write_packed_counts( + std::ostream& out) const { auto bytes = io::packed::write(out, counts_.size()); @@ -233,7 +234,8 @@ uint64_t postings_data::read_packed(std::istream& in) template uint64_t postings_data::bytes_used() const { - return sizeof(pair_t) * counts_.capacity() + length(p_id_); + return sizeof(pair_t) * counts_.capacity() + length(p_id_) + + sizeof(count_t); } } } From 86cb161d212167891089d47220cc35caef6f7013 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Mon, 10 Aug 2015 23:33:33 -0500 Subject: [PATCH 178/481] Link against jemalloc if it exists. This gives us performance improvements during indexing, since that still seems to be (unfortunately) malloc-bound. --- CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e3857b7c..4eb2f7846 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -155,6 +155,14 @@ if(ENABLE_PROFILING) target_link_libraries(meta-definitions INTERFACE ${GPERFTOOLS_PROFILER}) endif() +find_library(JEMALLOC_LIB NAMES jemalloc) +if(JEMALLOC_LIB) + message("-- Using jemalloc: ${JEMALLOC_LIB}") + target_link_libraries(meta-definitions INTERFACE ${JEMALLOC_LIB}) +else() + message("-- Using regular malloc; consider installing jemalloc") +endif() + if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") target_compile_definitions(meta-definitions INTERFACE -D_DARWIN_USE_64_BIT_INODE=1) From 5b50256e05cd7be1eab5463729b2fdad5b3ac187 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Mon, 10 Aug 2015 23:38:57 -0500 Subject: [PATCH 179/481] Attempt to fix travis-ci build. 
--- .travis.yml | 23 ++++----------------- travis/HandleOutOfTreeLLVM.patch | 11 +++++++++++ travis/install_libcxx.sh | 34 ++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 19 deletions(-) create mode 100644 travis/HandleOutOfTreeLLVM.patch create mode 100755 travis/install_libcxx.sh diff --git a/.travis.yml b/.travis.yml index ee9872f08..2aee2a0a9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,9 +13,12 @@ addons: apt: sources: - ubuntu-toolchain-r-test + - llvm-toolchain-precise-3.6 packages: - g++-4.8 - libicu-dev + - llvm-3.6-dev + - clang-3.6 install: - mkdir $HOME/lib @@ -29,25 +32,7 @@ install: # use g++-4.8 if g++ is our compiler - if [ "`echo $CXX`" == "g++" ]; then export CXX=g++-4.8; fi # install libc++ if tests are run with clang++ - - if [ "`echo $CXX`" == "clang++" ]; then cwd=$(pwd); fi - - if [ "`echo $CXX`" == "clang++" ]; then svn co --quiet http://llvm.org/svn/llvm-project/libcxx/trunk libcxx; fi - - if [ "`echo $CXX`" == "clang++" ]; then git clone https://github.com/pathscale/libcxxrt.git libcxxrt; fi - - if [ "`echo $CXX`" == "clang++" ]; then cd libcxxrt; fi - - if [ "`echo $CXX`" == "clang++" ]; then mkdir build; fi - - if [ "`echo $CXX`" == "clang++" ]; then cd build; fi - - if [ "`echo $CXX`" == "clang++" ]; then cmake -DCMAKE_BUILD_TYPE=Release ../; fi - - if [ "`echo $CXX`" == "clang++" ]; then make; fi - - if [ "`echo $CXX`" == "clang++" ]; then cp lib/libcxxrt.so $HOME/lib; fi - - if [ "`echo $CXX`" == "clang++" ]; then ln -sf $HOME/lib/libcxxrt.so $HOME/lib/libcxxrt.so.1; fi - - if [ "`echo $CXX`" == "clang++" ]; then ln -sf $HOME/lib/libcxxrt.so $HOME/lib/libcxxrt.so.1.0; fi - - if [ "`echo $CXX`" == "clang++" ]; then cd $cwd; fi - - if [ "`echo $CXX`" == "clang++" ]; then cd libcxx; fi - - if [ "`echo $CXX`" == "clang++" ]; then mkdir build; fi - - if [ "`echo $CXX`" == "clang++" ]; then cd build; fi - - if [ "`echo $CXX`" == "clang++" ]; then cmake -DLIBCXX_CXX_ABI=libcxxrt -DLIBCXX_CXX_ABI_INCLUDE_PATHS="../../libcxxrt/src" -DLIBCXX_CXX_ABI_LIBRARY_PATH=$HOME/lib -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$HOME ..; fi - - if [ "`echo $CXX`" == "clang++" ]; then make; fi - - if [ "`echo $CXX`" == "clang++" ]; then make install; fi - - if [ "`echo $CXX`" == "clang++" ]; then cd $cwd; fi + - if [ "`echo $CXX`" == "clang++" ]; then export CXX=clang++-3.6 && travis/install_libcxx.sh; fi before_script: - mkdir build diff --git a/travis/HandleOutOfTreeLLVM.patch b/travis/HandleOutOfTreeLLVM.patch new file mode 100644 index 000000000..c98c600a4 --- /dev/null +++ b/travis/HandleOutOfTreeLLVM.patch @@ -0,0 +1,11 @@ +--- HandleOutOfTreeLLVM.cmake.orig 2015-08-01 20:53:23.716932808 -0500 ++++ HandleOutOfTreeLLVM.cmake 2015-08-01 20:52:34.760265353 -0500 +@@ -35,7 +35,7 @@ + set(LLVM_INCLUDE_DIR ${INCLUDE_DIR} CACHE PATH "Path to llvm/include") + set(LLVM_BINARY_DIR ${LLVM_OBJ_ROOT} CACHE PATH "Path to LLVM build tree") + set(LLVM_MAIN_SRC_DIR ${MAIN_SRC_DIR} CACHE PATH "Path to LLVM source tree") +- set(LLVM_CMAKE_PATH "${LLVM_BINARY_DIR}/share/llvm/cmake") ++ set(LLVM_CMAKE_PATH "${LLVM_BINARY_DIR}/share/llvm/cmake" CACHE PATH "Path to LLVM cmake modules") + else() + set(LLVM_FOUND OFF) + return() diff --git a/travis/install_libcxx.sh b/travis/install_libcxx.sh new file mode 100755 index 000000000..492b6d72a --- /dev/null +++ b/travis/install_libcxx.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -x +cwd=$(pwd) +svn co --quiet http://llvm.org/svn/llvm-project/libcxx/trunk libcxx +git clone https://github.com/pathscale/libcxxrt.git libcxxrt +cd 
libcxxrt +mkdir build +cd build +cmake -DCMAKE_BUILD_TYPE=Release .. +make +cp lib/libcxxrt.so $HOME/lib +ln -sf $HOME/lib/libcxxrt.so $HOME/lib/libcxxrt.so.1 +ln -sf $HOME/lib/libcxxrt.so $HOME/lib/libcxxrt.so.1.0 +cd $cwd +cd libcxx +cd cmake/Modules +# HORRIBLE TERRIBLE NO GOOD VERY BAD +# hack the HandleOutOfTreeLLVM.cmake module file to allow us to actually +# specify a cmake path +patch -u HandleOutOfTreeLLVM.cmake $cwd/travis/HandleOutOfTreeLLVM.patch +cd ../../ +mkdir build +cd build +cmake -DLIBCXX_CXX_ABI=libcxxrt \ + -DLIBCXX_CXX_ABI_INCLUDE_PATHS="../../libcxxrt/src" \ + -DLIBCXX_CXX_ABI_LIBRARY_PATH=$HOME/lib \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=$HOME \ + -DLLVM_CONFIG=/usr/bin/llvm-config-3.6 \ + -DLLVM_CMAKE_PATH=/usr/share/llvm-3.6/cmake \ + .. +make +make install +cd $cwd From 73299256bee6a54b75ffd34d0d7410af5c184106 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Mon, 10 Aug 2015 23:55:10 -0500 Subject: [PATCH 180/481] First attempt at building with OSX in travis-ci. --- .travis.yml | 22 ++++++++-------------- travis/cmake.sh | 7 +++++++ travis/install_linux.sh | 15 +++++++++++++++ travis/install_osx.sh | 2 ++ 4 files changed, 32 insertions(+), 14 deletions(-) create mode 100755 travis/cmake.sh create mode 100755 travis/install_linux.sh create mode 100755 travis/install_osx.sh diff --git a/.travis.yml b/.travis.yml index 2aee2a0a9..30d223259 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,10 @@ language: cpp sudo: false +os: + - linux + - osx + compiler: - clang - gcc @@ -21,18 +25,8 @@ addons: - clang-3.6 install: - - mkdir $HOME/lib - - export LD_LIBRARY_PATH=$HOME/lib:$LD_LIBRARY_PATH - - mkdir $HOME/bin - - export PATH=$HOME/bin:$PATH - - mkdir $HOME/include - - export CPLUS_INCLUDE_PATH=$HOME/include:$CPLUS_INCLUDE_PATH - - wget http://www.cmake.org/files/v3.2/cmake-3.2.2-Linux-x86_64.sh - - sh cmake-3.2.2-Linux-x86_64.sh --prefix=$HOME --exclude-subdir - # use g++-4.8 if g++ is our compiler - - if [ "`echo $CXX`" == "g++" ]; then export CXX=g++-4.8; fi - # install libc++ if tests are run with clang++ - - if [ "`echo $CXX`" == "clang++" ]; then export CXX=clang++-3.6 && travis/install_libcxx.sh; fi + - if [ "$TRAVIS_OS_NAME" == "linux"]; then travis/install_linux.sh; fi + - if [ "$TRAVIS_OS_NAME" == "osx"]; then travis/install_osx.sh; fi before_script: - mkdir build @@ -40,5 +34,5 @@ before_script: - cp ../config.toml ./ script: - - cmake ../ -DCMAKE_BUILD_TYPE=Debug && make && make clean - - rm -rf CMake* && cmake ../ -DCMAKE_BUILD_TYPE=Release && make && ctest --output-on-failure + - travis/cmake.sh Debug && make && make clean + - rm -rf CMake* && travis/cmake.sh Release && make && ctest --output-on-failure diff --git a/travis/cmake.sh b/travis/cmake.sh new file mode 100755 index 000000000..d04840e95 --- /dev/null +++ b/travis/cmake.sh @@ -0,0 +1,7 @@ +if [ "$TRAVIS_OS_NAME" == "linux" ] then + cmake -DCMAKE_BUILD_TYPE=$1 .. +fi + +if [ "$TRAVIS_OS_NAME" == "osx" ] then + cmake -DCMAKE_BUILD_TYPE=$1 -DICU_ROOT=/usr/local/opt/icu4c .. 
+fi diff --git a/travis/install_linux.sh b/travis/install_linux.sh new file mode 100755 index 000000000..4dc51665c --- /dev/null +++ b/travis/install_linux.sh @@ -0,0 +1,15 @@ +set -x +mkdir $HOME/lib +export LD_LIBRARY_PATH=$HOME/lib:$LD_LIBRARY_PATH +mkdir $HOME/bin +export PATH=$HOME/bin:$PATH +mkdir $HOME/include +export CPLUS_INCLUDE_PATH=$HOME/include:$CPLUS_INCLUDE_PATH +wget http://www.cmake.org/files/v3.2/cmake-3.2.2-Linux-x86_64.sh +sh cmake-3.2.2-Linux-x86_64.sh --prefix=$HOME --exclude-subdir +if [ "`echo $CXX`" == "g++" ]; then + export CXX=g++-4.8; +fi +if [ "`echo $CXX`" == "clang++" ]; then + export CXX=clang++-3.6 && travis/install_libcxx.sh +fi diff --git a/travis/install_osx.sh b/travis/install_osx.sh new file mode 100755 index 000000000..f4ea6b474 --- /dev/null +++ b/travis/install_osx.sh @@ -0,0 +1,2 @@ +set -x +brew install icu4c From c80d69ea1d1f93cb3733132e15fa5925b745cdce Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Mon, 10 Aug 2015 23:57:43 -0500 Subject: [PATCH 181/481] Fix path to travis/cmake.sh. --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 30d223259..6d08b0742 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,5 +34,5 @@ before_script: - cp ../config.toml ./ script: - - travis/cmake.sh Debug && make && make clean - - rm -rf CMake* && travis/cmake.sh Release && make && ctest --output-on-failure + - ../travis/cmake.sh Debug && make && make clean + - rm -rf CMake* && ../travis/cmake.sh Release && make && ctest --output-on-failure From 04d46d9dd93e1707dc9610c7d247d054aab4aede Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Mon, 10 Aug 2015 23:59:14 -0500 Subject: [PATCH 182/481] Fix syntax for if statements in bash scripts. --- .travis.yml | 4 ++-- travis/cmake.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6d08b0742..9c4bf85e4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,8 +25,8 @@ addons: - clang-3.6 install: - - if [ "$TRAVIS_OS_NAME" == "linux"]; then travis/install_linux.sh; fi - - if [ "$TRAVIS_OS_NAME" == "osx"]; then travis/install_osx.sh; fi + - if [ "$TRAVIS_OS_NAME" == "linux" ]; then travis/install_linux.sh; fi + - if [ "$TRAVIS_OS_NAME" == "osx" ]; then travis/install_osx.sh; fi before_script: - mkdir build diff --git a/travis/cmake.sh b/travis/cmake.sh index d04840e95..bc249be32 100755 --- a/travis/cmake.sh +++ b/travis/cmake.sh @@ -1,7 +1,7 @@ -if [ "$TRAVIS_OS_NAME" == "linux" ] then +if [ "$TRAVIS_OS_NAME" == "linux" ]; then cmake -DCMAKE_BUILD_TYPE=$1 .. fi -if [ "$TRAVIS_OS_NAME" == "osx" ] then +if [ "$TRAVIS_OS_NAME" == "osx" ]; then cmake -DCMAKE_BUILD_TYPE=$1 -DICU_ROOT=/usr/local/opt/icu4c .. fi From ff06f8cbb5b8e417188d24fed988dc62b39ebb97 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 11 Aug 2015 00:01:09 -0500 Subject: [PATCH 183/481] Source the bash scripts since they export stuff. 
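
Running `travis/install_linux.sh` directly executes it in a child
shell, so the `export CXX=...` lines it contains die with that child;
sourcing runs the script in the calling shell instead. The same
process-environment rule, shown as a standalone POSIX C++ sketch
(purely illustrative, not code from this repository):

    #include <cstdio>     // std::printf
    #include <cstdlib>    // setenv, std::getenv
    #include <sys/wait.h> // wait
    #include <unistd.h>   // fork

    int main()
    {
        if (fork() == 0)
        {
            // like an *executed* script: only the child's copy of the
            // environment is modified
            setenv("CXX", "clang++-3.6", /*overwrite=*/1);
            return 0;
        }
        wait(nullptr);
        // prints "(unset)" unless CXX was set before launching: the
        // child's setenv never reaches the parent process
        std::printf("CXX: %s\n",
                    std::getenv("CXX") ? std::getenv("CXX") : "(unset)");
    }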
--- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9c4bf85e4..e1998a55a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,8 +25,8 @@ addons: - clang-3.6 install: - - if [ "$TRAVIS_OS_NAME" == "linux" ]; then travis/install_linux.sh; fi - - if [ "$TRAVIS_OS_NAME" == "osx" ]; then travis/install_osx.sh; fi + - if [ "$TRAVIS_OS_NAME" == "linux" ]; then source travis/install_linux.sh; fi + - if [ "$TRAVIS_OS_NAME" == "osx" ]; then source travis/install_osx.sh; fi before_script: - mkdir build From fe080c52675b483a45c93c110f1c4aadc3d27cf6 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 7 Aug 2015 15:26:27 -0500 Subject: [PATCH 184/481] fix std::hash specialization using util::string_view --- include/util/string_view.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/util/string_view.h b/include/util/string_view.h index cf45a92ec..208059d54 100644 --- a/include/util/string_view.h +++ b/include/util/string_view.h @@ -609,8 +609,9 @@ struct hash> size_t operator()( const meta::util::basic_string_view& view) const noexcept { - static constexpr meta::util::murmur_hash<> hasher{}; - return hasher(view.data(), view.size()); + static meta::util::murmur_hash<> hasher{}; + return hasher(reinterpret_cast(view.data()), + view.size()); } }; } From 4fe5328661406820bab52fd15183a11e82ef1a4e Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 7 Aug 2015 15:31:24 -0500 Subject: [PATCH 185/481] murmur_hash updates - accept seed in ctor - make operator() const - use detail namespace --- include/util/hash.h | 71 +++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/include/util/hash.h b/include/util/hash.h index b75a90046..e999ea429 100644 --- a/include/util/hash.h +++ b/include/util/hash.h @@ -11,6 +11,7 @@ #define META_UTIL_HASH_H_ #include +#include namespace meta { @@ -22,9 +23,9 @@ namespace util * will return a 32-bit or 64-bit hash value. */ template -struct murmur_hash; +class murmur_hash; -namespace +namespace detail { inline uint32_t rotl(uint32_t x, int8_t r) { @@ -63,18 +64,25 @@ inline uint64_t fmix(uint64_t h) * Murmur3Hash for 32-bit outputs. Based on MurmurHash3_x86_32. */ template <> -struct murmur_hash<4> +class murmur_hash<4> { - constexpr murmur_hash() = default; + public: + murmur_hash() : seed_{std::random_device{}()} + { + } + + murmur_hash(std::size_t seed) : seed_{seed} + { + } - std::size_t operator()(const uint8_t* data, int len, uint32_t seed) + std::size_t operator()(const uint8_t* data, int len) const { - std::size_t out = seed; + std::size_t out = seed_; const auto nblocks = len / 4; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; auto blocks = reinterpret_cast(data + nblocks * 4); @@ -83,11 +91,11 @@ struct murmur_hash<4> auto k1 = blocks[i]; k1 *= c1; - k1 = rotl(k1, 15); + k1 = detail::rotl(k1, 15); k1 *= c2; out ^= k1; - out = rotl(out, 13); + out = detail::rotl(out, 13); out = out * 5 + 0xe6546b64; } @@ -103,31 +111,41 @@ struct murmur_hash<4> case 1: k1 ^= tail[0]; k1 *= c1; - k1 = rotl(k1, 15); + k1 = detail::rotl(k1, 15); k1 *= c2; out ^= k1; } out ^= len; - return fmix(out); + return detail::fmix(out); } + + private: + std::size_t seed_; }; /** * MurmurHash3 for 64-bit outputs. Based on MurmurHash3_x64_128. 
*/ template <> -struct murmur_hash<8> +class murmur_hash<8> { - constexpr murmur_hash() = default; + public: + murmur_hash() : seed_{std::random_device{}()} + { + } + + murmur_hash(uint64_t seed) : seed_{seed} + { + } - std::size_t operator()(const uint8_t* data, int len, uint64_t seed) + std::size_t operator()(const uint8_t* data, int len) const { const auto nblocks = len / 16; - auto h1 = seed; - auto h2 = seed; + auto h1 = seed_; + auto h2 = seed_; const uint64_t c1 = 0x87c37b91114253d5LLU; const uint64_t c2 = 0x4cf5ad432745937fLLU; @@ -140,20 +158,20 @@ struct murmur_hash<8> auto k2 = blocks[i * 2 + 1]; k1 *= c1; - k1 = rotl(k1, 31); + k1 = detail::rotl(k1, 31); k1 *= c2; h1 ^= k1; - h1 = rotl(h1, 27); + h1 = detail::rotl(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729; k2 *= c2; - k2 = rotl(k2, 33); + k2 = detail::rotl(k2, 33); k2 *= c1; h2 ^= k2; - h2 = rotl(h2, 31); + h2 = detail::rotl(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5; } @@ -180,7 +198,7 @@ struct murmur_hash<8> case 9: k2 ^= static_cast(tail[8]); k2 *= c2; - k2 = rotl(k2, 33); + k2 = detail::rotl(k2, 33); k2 *= c1; h2 ^= k2; @@ -201,7 +219,7 @@ struct murmur_hash<8> case 1: k1 ^= static_cast(tail[0]); k1 *= c1; - k1 = rotl(k1, 31); + k1 = detail::rotl(k1, 31); k1 *= c2; h1 ^= k1; } @@ -212,14 +230,17 @@ struct murmur_hash<8> h1 += h2; h2 += h1; - h1 = fmix(h1); - h2 = fmix(h2); + h1 = detail::fmix(h1); + h2 = detail::fmix(h2); h1 += h2; // h2 += h1, unneeded since we only want 64-bits. return h1; } + + private: + uint64_t seed_; }; } } From 0800f15d75671a92c807fb18fdd31335532a75bb Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 11 Aug 2015 00:06:37 -0500 Subject: [PATCH 186/481] Localize debugging in bash scripts. --- travis/install_libcxx.sh | 3 ++- travis/install_linux.sh | 3 ++- travis/install_osx.sh | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/travis/install_libcxx.sh b/travis/install_libcxx.sh index 492b6d72a..4e922cbc6 100755 --- a/travis/install_libcxx.sh +++ b/travis/install_libcxx.sh @@ -1,5 +1,5 @@ #!/bin/bash -set -x +set -v cwd=$(pwd) svn co --quiet http://llvm.org/svn/llvm-project/libcxx/trunk libcxx git clone https://github.com/pathscale/libcxxrt.git libcxxrt @@ -32,3 +32,4 @@ cmake -DLIBCXX_CXX_ABI=libcxxrt \ make make install cd $cwd +set +v diff --git a/travis/install_linux.sh b/travis/install_linux.sh index 4dc51665c..17b9c10d5 100755 --- a/travis/install_linux.sh +++ b/travis/install_linux.sh @@ -1,4 +1,4 @@ -set -x +set -v mkdir $HOME/lib export LD_LIBRARY_PATH=$HOME/lib:$LD_LIBRARY_PATH mkdir $HOME/bin @@ -13,3 +13,4 @@ fi if [ "`echo $CXX`" == "clang++" ]; then export CXX=clang++-3.6 && travis/install_libcxx.sh fi +set +v diff --git a/travis/install_osx.sh b/travis/install_osx.sh index f4ea6b474..d2b990c2f 100755 --- a/travis/install_osx.sh +++ b/travis/install_osx.sh @@ -1,2 +1,3 @@ -set -x +set -v brew install icu4c +set +v From c17cbf8dc2ff40fe9a0418edceb1f5540bbdc3a5 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 11 Aug 2015 00:10:44 -0500 Subject: [PATCH 187/481] Exclude gcc build on OS X. 
---
 .travis.yml | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index e1998a55a..868917b3d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,8 +10,14 @@ os:
   - osx
 
 compiler:
-  - clang
-  - gcc
+  - clang
+  - gcc
+
+# we only care about clang on OS X
+matrix:
+  exclude:
+    - os: osx
+      compiler: gcc
 
 addons:
   apt:

From cc9c61c340eb5028c9392bae04dd48d407f83913 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Tue, 11 Aug 2015 00:15:27 -0500
Subject: [PATCH 188/481] Fix ambiguity with rotl/fmix on OS X.

---
 include/util/hash.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/util/hash.h b/include/util/hash.h
index e999ea429..189317e9d 100644
--- a/include/util/hash.h
+++ b/include/util/hash.h
@@ -77,7 +77,9 @@ class murmur_hash<4>
 
     std::size_t operator()(const uint8_t* data, int len) const
     {
-        std::size_t out = seed_;
+        // this *has* to be uint32_t for OS X clang to correctly resolve
+        // between the two versions of rotl/fmix in namespace detail above.
+        uint32_t out = seed_;
 
         const auto nblocks = len / 4;
 
From 3089cb4557b30c2c3f90a8be8a9cd7d2bb195cb6 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Tue, 11 Aug 2015 00:20:05 -0500
Subject: [PATCH 189/481] Fix another case where OS X has overload resolution
 trouble.

---
 src/index/metadata_writer.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/index/metadata_writer.cpp b/src/index/metadata_writer.cpp
index 44e0f1d19..4be4dd1f7 100644
--- a/src/index/metadata_writer.cpp
+++ b/src/index/metadata_writer.cpp
@@ -20,7 +20,9 @@ metadata_writer::metadata_writer(const std::string& prefix, uint64_t num_docs,
       schema_{std::move(schema)}
 {
     // write metadata header
-    byte_pos_ += io::packed::write(db_file_, schema_.size() + 2);
+    // cast below is needed for OS X overload resolution
+    byte_pos_ += io::packed::write(db_file_,
+                                   static_cast<uint64_t>(schema_.size() + 2));
     byte_pos_ += io::write_binary(db_file_, std::string{"length"});
     byte_pos_ += io::write_binary(db_file_,
                                   corpus::metadata::field_type::UNSIGNED_INT);

From ddea599cb170772515f8f9d069133c503f73c079 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Thu, 13 Aug 2015 16:33:16 -0500
Subject: [PATCH 190/481] Add configurable indexer RAM limits (fixes issue
 #103).

A new config option, indexer-ram-budget in the global config scope,
allows users to specify an estimated RAM budget for indexing. As high a
value as possible is preferred, as this speeds up indexing, but a
sensible default of 1GB is provided.
---
 config.toml                     |  2 ++
 include/index/chunk_handler.h   | 10 +++++++---
 include/index/chunk_handler.tcc | 15 ++++++++-------
 src/index/forward_index.cpp     | 16 ++++++++++++----
 src/index/inverted_index.cpp    | 29 ++++++++++++++++++++++-------
 5 files changed, 51 insertions(+), 21 deletions(-)

diff --git a/config.toml b/config.toml
index 488c2f4b0..5dc453826 100644
--- a/config.toml
+++ b/config.toml
@@ -12,6 +12,8 @@ dataset = "20newsgroups"
 corpus = "line.toml" # located inside dataset folder
 forward-index = "20news-fwd"
 inverted-index = "20news-inv"
+indexer-ram-budget = 1024 # **estimated** RAM budget for indexing in MB
+                          # always set this lower than your physical RAM!
 [[analyzers]]
 method = "ngram-word"
diff --git a/include/index/chunk_handler.h b/include/index/chunk_handler.h
index d427b017f..3f91365f9 100644
--- a/include/index/chunk_handler.h
+++ b/include/index/chunk_handler.h
@@ -49,8 +49,10 @@ class chunk_handler
         /**
          * @param parent A back-pointer to the handler this producer is
          * operating on
+         * @param ram_budget The **estimated** allowed size of the buffer
+         * for this producer
          */
-        producer(chunk_handler* parent);
+        producer(chunk_handler* parent, uint64_t ram_budget);
 
         /**
          * Handler for when a given secondary_key has been processed and is
@@ -84,7 +86,7 @@ class chunk_handler
          * This is an *estimate*, so you should make sure there's some slop
          * in this number to make sure you don't run out of memory.
          */
-        const static uint64_t constexpr max_size = 1024 * 1024 * 256; // 256 MB
+        uint64_t max_size_;
 
         /// Back-pointer to the handler this producer is operating on
         chunk_handler* parent_;
@@ -100,9 +102,11 @@ class chunk_handler
      * Creates a producer for this chunk_handler. Producers are designed to
      * be thread-local buffers of chunks that write to disk when their
      * buffer is full.
+     * @param ram_budget The estimated allowed size of this thread-local
+     * buffer
      * @return a new producer
      */
-    producer make_producer();
+    producer make_producer(uint64_t ram_budget);
 
     /**
      * @return the number of chunks this handler has written to disk.
diff --git a/include/index/chunk_handler.tcc b/include/index/chunk_handler.tcc
index 03ebee8fe..4a6cf817c 100644
--- a/include/index/chunk_handler.tcc
+++ b/include/index/chunk_handler.tcc
@@ -3,6 +3,7 @@
  * @author Chase Geigle
  */
 
+#include <cassert>
 #include 
 
 #include "index/chunk_handler.h"
@@ -15,12 +16,14 @@ namespace index
 {
 
 template <class Index>
-chunk_handler<Index>::producer::producer(chunk_handler* parent)
-    : parent_{parent}
+chunk_handler<Index>::producer::producer(chunk_handler* parent,
+                                         uint64_t ram_budget)
+    : max_size_{ram_budget}, parent_{parent}
 {
     // sizeof(size_t): list size per bucket
     // sizeof(void*): head pointer per bucket
     chunk_size_ = pdata_.bucket_count() * (sizeof(size_t) + sizeof(void*));
+    assert(chunk_size_ < max_size_);
 }
 
 template <class Index>
@@ -66,7 +69,7 @@ void chunk_handler<Index>::producer::operator()(const secondary_key_type& key,
             chunk_size_ += (it->bytes_used() + sizeof(void*)) / 4;
         }
 
-        if (chunk_size_ >= max_size)
+        if (chunk_size_ >= max_size_)
             flush_chunk();
     }
 }
@@ -104,9 +107,9 @@ chunk_handler<Index>::chunk_handler(const std::string& prefix)
 }
 
 template <class Index>
-auto chunk_handler<Index>::make_producer() -> producer
+auto chunk_handler<Index>::make_producer(uint64_t ram_budget) -> producer
 {
-    return {this};
+    return {this, ram_budget};
 }
 
 template <class Index>
@@ -133,8 +136,6 @@ void chunk_handler<Index>::write_chunk(std::vector<index_pdata_type>& pdata)
         for (auto& p : pdata)
             p.write_packed(outfile);
     }
-    std::ofstream termfile{chunk_name + ".numterms"};
-    termfile << pdata.size();
     pdata.clear();
 
     std::lock_guard<std::mutex> lock{mutables_};
diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp
index 890cbb7be..812ce4f41 100644
--- a/src/index/forward_index.cpp
+++ b/src/index/forward_index.cpp
@@ -44,8 +44,10 @@ class forward_index::impl
     /**
      * @param inv_idx The inverted index to uninvert
+     * @param ram_budget The **estimated** allowed size of an in-memory
+     * chunk
      */
-    void uninvert(const inverted_index& inv_idx);
+    void uninvert(const inverted_index& inv_idx, uint64_t ram_budget);
 
     /**
      * @param name The name of the inverted index to copy data from
@@ -182,8 +184,13 @@ void forward_index::create_index(const std::string& config_file)
     }
 
     auto inv_idx = make_index<inverted_index>(config_file);
+    uint64_t ram_budget =
1024; + if (auto cfg_ram_budget = config.get_as("indexer-ram-budget")) + ram_budget = static_cast(*cfg_ram_budget); + fwd_impl_->create_uninverted_metadata(inv_idx->index_name()); - fwd_impl_->uninvert(*inv_idx); + // RAM budget is given in MB + fwd_impl_->uninvert(*inv_idx, ram_budget * 1024 * 1024); impl_->load_term_id_mapping(); fwd_impl_->total_unique_terms_ = impl_->total_unique_terms(); } @@ -310,11 +317,12 @@ util::optional> return fwd_impl_->postings_->find_stream(d_id); } -void forward_index::impl::uninvert(const inverted_index& inv_idx) +void forward_index::impl::uninvert(const inverted_index& inv_idx, + uint64_t ram_budget) { chunk_handler handler{idx_->index_name()}; { - auto producer = handler.make_producer(); + auto producer = handler.make_producer(ram_budget); for (term_id t_id{0}; t_id < inv_idx.unique_terms(); ++t_id) { auto pdata = inv_idx.search_primary(t_id); diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp index 9c021b66e..39ef510f4 100644 --- a/src/index/inverted_index.cpp +++ b/src/index/inverted_index.cpp @@ -50,11 +50,12 @@ class inverted_index::impl * @param handler The chunk handler for this index * @param mdata_parser The parser for reading metadata * @param mdata_writer The writer for metadata + * @param ram_budget The total **estimated** RAM budget * @return the number of chunks created */ void tokenize_docs(corpus::corpus* docs, chunk_handler& handler, - metadata_writer& mdata_writer); + metadata_writer& mdata_writer, uint64_t ram_budget); /** * Compresses the large postings file. @@ -120,6 +121,13 @@ void inverted_index::create_index(const std::string& config_file) // load the documents from the corpus auto docs = corpus::corpus::load(config_file); + auto config = cpptoml::parse_file(config_file); + auto cfg_ram_budget = config.get_as("indexer-ram-budget"); + + uint64_t ram_budget = 1024; + if (cfg_ram_budget) + ram_budget = static_cast(*cfg_ram_budget); + chunk_handler handler{index_name()}; { metadata_writer mdata_writer{index_name(), docs->size(), @@ -127,7 +135,9 @@ void inverted_index::create_index(const std::string& config_file) uint64_t num_docs = docs->size(); impl_->load_labels(num_docs); - inv_impl_->tokenize_docs(docs.get(), handler, mdata_writer); + // RAM budget is given in megabytes + inv_impl_->tokenize_docs(docs.get(), handler, mdata_writer, + ram_budget * 1024 * 1024); } handler.merge_chunks(); @@ -163,14 +173,15 @@ void inverted_index::load_index() void inverted_index::impl::tokenize_docs(corpus::corpus* docs, chunk_handler& handler, - metadata_writer& mdata_writer) + metadata_writer& mdata_writer, + uint64_t ram_budget) { std::mutex mutex; printing::progress progress{" > Tokenizing Docs: ", docs->size()}; - auto task = [&]() + auto task = [&](uint64_t ram_budget) { - auto producer = handler.make_producer(); + auto producer = handler.make_producer(ram_budget); auto analyzer = analyzer_->clone(); while (true) { @@ -207,8 +218,12 @@ void inverted_index::impl::tokenize_docs(corpus::corpus* docs, parallel::thread_pool pool; std::vector> futures; - for (size_t i = 0; i < pool.thread_ids().size(); ++i) - futures.emplace_back(pool.submit_task(task)); + auto num_threads = pool.thread_ids().size(); + for (size_t i = 0; i < num_threads; ++i) + { + futures.emplace_back( + pool.submit_task(std::bind(task, ram_budget / num_threads))); + } for (auto& fut : futures) fut.get(); From d93999ad43136b3ecbb4b800405036b70a01aa39 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 14 Aug 2015 21:07:58 -0500 Subject: [PATCH 191/481] 
Improve memory utilization during indexing.

In-memory chunks are now stored using the same compressed format as the
on-disk postings lists, but in in-memory byte buffers. These buffers
are organized into a custom insert-only linear probing hash set, which
allows us to have better control and measurement of memory utilization.

This shaves off ~100 seconds when indexing Wikipedia on my local
machine. The entire dataset can be indexed without a single flush
before the final merge step with less than 8GB of RAM.
---
 include/index/chunk.tcc         |   2 +-
 include/index/chunk_handler.h   |   9 +-
 include/index/chunk_handler.tcc |  85 ++++----
 include/index/postings_buffer.h | 312 ++++++++++++++++++++++++++++
 include/index/postings_data.h   |   5 +-
 include/index/postings_data.tcc |   5 +-
 include/index/postings_file.h   |   2 +-
 include/index/postings_stream.h | 117 ++++++-----
 include/util/probe_set.h        | 358 ++++++++++++++++++++++++++++++++
 9 files changed, 801 insertions(+), 94 deletions(-)
 create mode 100644 include/index/postings_buffer.h
 create mode 100644 include/util/probe_set.h

diff --git a/include/index/chunk.tcc b/include/index/chunk.tcc
index dd4f37f25..908354887 100644
--- a/include/index/chunk.tcc
+++ b/include/index/chunk.tcc
@@ -64,7 +64,7 @@ void chunk<PrimaryKey, SecondaryKey>::memory_merge_with(Container& pdata)
             ++terms;
             if (my_pd.primary_key() == other_pd->primary_key())
             {
-                my_pd.merge_with(*other_pd);
+                my_pd.merge_with(other_pd->stream());
                 my_pd.write_packed(output);
                 my_pd.read_packed(my_data);
                 ++other_pd;
diff --git a/include/index/chunk_handler.h b/include/index/chunk_handler.h
index 3f91365f9..4bb956d64 100644
--- a/include/index/chunk_handler.h
+++ b/include/index/chunk_handler.h
@@ -15,12 +15,13 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
 #include "index/chunk.h"
+#include "index/postings_buffer.h"
 #include "util/optional.h"
+#include "util/probe_set.h"
 
 namespace meta
 {
@@ -39,6 +40,8 @@ class chunk_handler
     using primary_key_type = typename index_pdata_type::primary_key_type;
     using secondary_key_type = typename index_pdata_type::secondary_key_type;
     using chunk_t = chunk<primary_key_type, secondary_key_type>;
+    using postings_buffer_type
+        = postings_buffer<primary_key_type, secondary_key_type>;
 
     /**
      * The object that is fed postings_data by the index.
@@ -76,7 +79,7 @@ class chunk_handler void flush_chunk(); /// Current in-memory chunk - std::unordered_set pdata_; + util::probe_set pdata_; /// Current size of the in-memory chunk uint64_t chunk_size_; @@ -142,7 +145,7 @@ class chunk_handler * @param pdata The collection of postings_data objects to combine into a * chunk */ - void write_chunk(std::vector& pdata); + void write_chunk(std::vector& pdata); /// The prefix for all chunks to be written std::string prefix_; diff --git a/include/index/chunk_handler.tcc b/include/index/chunk_handler.tcc index 4a6cf817c..dc08ce1e2 100644 --- a/include/index/chunk_handler.tcc +++ b/include/index/chunk_handler.tcc @@ -20,9 +20,7 @@ chunk_handler::producer::producer(chunk_handler* parent, uint64_t ram_budget) : max_size_{ram_budget}, parent_{parent} { - // sizeof(size_t): list size per bucket - // sizeof(void*): head pointer per bucket - chunk_size_ = pdata_.bucket_count() * (sizeof(size_t) + sizeof(void*)); + chunk_size_ = pdata_.bytes_used(); assert(chunk_size_ < max_size_); } @@ -33,40 +31,44 @@ void chunk_handler::producer::operator()(const secondary_key_type& key, { for (const auto& count : counts) { - index_pdata_type pd{count.first}; - pd.increase_count(key, count.second); - auto it = pdata_.find(pd); + postings_buffer_type pb{count.first}; + auto it = pdata_.find(pb); if (it == pdata_.end()) { - // sizeof(size_t): list size per bucket - // sizeof(void*): head pointer per bucket - chunk_size_ -= pdata_.bucket_count() - * (sizeof(size_t) + sizeof(void*)); - - // sizeof(void*): next pointer per element - chunk_size_ += pd.bytes_used() + sizeof(void*); - // 25% slop factor - chunk_size_ += (pd.bytes_used() + sizeof(void*)) / 4; - - pdata_.emplace(pd); - - // sizeof(size_t): list size per bucket - // sizeof(void*): head pointer per bucket - chunk_size_ += pdata_.bucket_count() - * (sizeof(size_t) + sizeof(void*)); + // check if we would resize on an insert + const auto& max_load_factor = pdata_.max_load_factor(); + if (max_load_factor.denominator * (pdata_.size() + 1) + >= max_load_factor.numerator * pdata_.capacity()) + { + // now check if roughly doubling our bytes used is going to + // cause problems + auto next_chunk_size = chunk_size_ + pdata_.bytes_used() + + pdata_.bytes_used() / 2; + if (next_chunk_size >= max_size_) + { + // if so, flush the current chunk before carrying on + flush_chunk(); + } + } + + chunk_size_ -= pdata_.bytes_used(); + + pb.write_count(key, static_cast(count.second)); + chunk_size_ += pb.bytes_used(); + pdata_.emplace(std::move(pb)); + + chunk_size_ += pdata_.bytes_used(); } else { - chunk_size_ -= it->bytes_used() + sizeof(void*); - chunk_size_ -= (it->bytes_used() + sizeof(void*)) / 4; + chunk_size_ -= it->bytes_used(); // note: we can modify elements in this set because we do not change // how comparisons are made (the primary_key value) - const_cast(*it) - .increase_count(key, count.second); + const_cast(*it) + .write_count(key, static_cast(count.second)); - chunk_size_ += it->bytes_used() + sizeof(void*); - chunk_size_ += (it->bytes_used() + sizeof(void*)) / 4; + chunk_size_ += it->bytes_used(); } if (chunk_size_ >= max_size_) @@ -80,17 +82,20 @@ void chunk_handler::producer::flush_chunk() if (pdata_.empty()) return; - std::vector pdata; - for (auto it = pdata_.begin(); it != pdata_.end(); it = pdata_.erase(it)) - pdata.emplace_back(std::move(*it)); - - pdata_.clear(); + // extract the keys, emptying the hash set + auto pdata = pdata_.extract_keys(); std::sort(pdata.begin(), pdata.end()); 
     parent_->write_chunk(pdata);
-
-    // sizeof(size_t): list size per bucket
-    // sizeof(void*): head pointer per bucket
-    chunk_size_ = pdata_.bucket_count() * (sizeof(size_t) + sizeof(void*));
+    chunk_size_ = pdata_.bytes_used();
+
+    // if the table itself is beyond the maximum chunk size, start over
+    // (this should rarely, if ever, happen)
+    if (chunk_size_ > max_size_)
+    {
+        decltype(pdata_){}.swap(pdata_);
+        chunk_size_ = pdata_.bytes_used();
+    }
 }
 
 template <class Index>
-void chunk_handler<Index>::write_chunk(std::vector<index_pdata_type>& pdata)
+void chunk_handler<Index>::write_chunk(std::vector<postings_buffer_type>& pdata)
 {
     auto chunk_num = chunk_num_.fetch_add(1);
@@ -152,6 +157,12 @@
 }
 
 namespace detail
 {
+/**
+ * Represents an on-disk chunk to be merged with multi-way merge sort. Each
+ * input_chunk stores the file it's reading from, the total bytes needed to
+ * be read, and the current number of bytes read, and buffers in one
+ * postings entry at a time.
+ */
 template <class Index>
 struct input_chunk
 {
diff --git a/include/index/postings_buffer.h b/include/index/postings_buffer.h
new file mode 100644
index 000000000..15aed6a99
--- /dev/null
+++ b/include/index/postings_buffer.h
@@ -0,0 +1,312 @@
+/**
+ * @file postings_buffer.h
+ * @author Chase Geigle
+ *
+ * All files in META are dual-licensed under the MIT and NCSA licenses. For more
+ * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the
+ * project.
+ */
+
+#ifndef META_INDEX_POSTINGS_BUFFER_H_
+#define META_INDEX_POSTINGS_BUFFER_H_
+
+#include 
+#include 
+#include 
+
+#include "index/postings_stream.h"
+#include "io/packed.h"
+#include "util/shim.h"
+
+namespace meta
+{
+namespace index
+{
+
+namespace detail
+{
+/**
+ * Gets the bytes used by a std::string.
+ */
+template <class T>
+uint64_t bytes_used(
+    const T& elem,
+    typename std::enable_if<std::is_same<T, std::string>::value>::type* = nullptr)
+{
+    return elem.capacity();
+}
+
+/**
+ * Gets the bytes used by anything not a std::string.
+ */
+template <class T>
+uint64_t bytes_used(
+    const T& elem,
+    typename std::enable_if<!std::is_same<T, std::string>::value>::type* = nullptr)
+{
+    return sizeof(elem);
+}
+}
+
+/**
+ * Represents the postings list for an in-memory chunk associated with a
+ * specific PrimaryKey (usually a std::string). Each postings_buffer stores
+ * the PrimaryKey, the total number of (key, value) pairs in the postings
+ * list, the total sum of the counts in the postings list, and a byte
+ * buffer that holds the compressed form of the postings list itself. This
+ * allows us to store significantly larger in-memory chunks than if we were
+ * to store the full materialized postings_data.
+ */
+template <class PrimaryKey, class SecondaryKey>
+class postings_buffer
+{
+  private:
+    using byte_type = uint8_t;
+    using buffer_type = std::vector<byte_type>;
+    using const_buffer_iterator = buffer_type::const_iterator;
+
+    /// A simple input stream that reads from a buffer using an iterator
+    struct buffer_input_stream
+    {
+        buffer_input_stream(const_buffer_iterator it) : it_{it}
+        {
+            // nothing
+        }
+
+        char get()
+        {
+            return *it_++;
+        }
+
+        const_buffer_iterator it_;
+    };
+
+  public:
+    /**
+     * Creates a postings_buffer for a specific primary key.
+     */
+    postings_buffer(PrimaryKey pk) : pk_(std::move(pk))
+    {
+        // nothing
+    }
+
+    /**
+     * @return the primary key for this postings_buffer
+     */
+    const PrimaryKey& primary_key() const
+    {
+        return pk_;
+    }
+
+    /**
+     * Writes a postings entry to the in-memory byte buffer in compressed
+     * format.
+ * @param id The SecondaryKey for the pair + * @param count The count value associated with the id + */ + template + void write_count(SecondaryKey id, FeatureValue count) + { + ++num_ids_; + total_counts_ += static_cast(count); + + assert(id > last_id_); + io::packed::write(buffer_, id - last_id_); + io::packed::write(buffer_, count); + + last_id_ = id; + } + + /** + * @return an estimate of the number of heap allocated bytes this + * structure uses + */ + std::size_t bytes_used() const + { + auto bytes = buffer_.size_; + + // this only matters when PrimaryKey is std::string. + // if the capacity of the string is bigger than the size of the + // string itself, then we know it must also be using heap memory, + // which we haven't accounted for already. + if (detail::bytes_used(pk_) > sizeof(PrimaryKey)) + bytes += detail::bytes_used(pk_); + return bytes; + } + + /** + * Writes this buffer directly to an output stream. + * @param os The output stream to write to + * @return the number of bytes written + */ + template + uint64_t write_packed(OutputStream& os) + { + auto bytes = io::packed::write(os, pk_); + bytes += io::packed::write(os, num_ids_); + bytes += io::packed::write(os, total_counts_); + + buffer_.write(os); + return bytes + buffer_.size_; + } + + /** + * @return a postings_stream to iterate over the byte buffer + */ + template + postings_stream stream() const + { + return {reinterpret_cast(buffer_.bytes_.get()), num_ids_, + total_counts_}; + } + + /** + * @param rhs The other buffer + * @return whether the primary key of this buffer is less than the + * primary key of the other buffer + */ + bool operator<(const postings_buffer& rhs) const + { + return pk_ < rhs.pk_; + } + + /** + * @param rhs The other buffer + * @return whether the primary keys of the two buffers are equal + */ + bool operator==(const postings_buffer& rhs) const + { + return pk_ == rhs.pk_; + } + + private: + /// A simple byte buffer that resizes with a 1.5x policy when full + struct char_buffer + { + /// Constructs an empty buffer + char_buffer() : size_{0}, pos_{0} + { + } + + /** + * Copies an existing buffer + * @param other The buffer to copy + */ + char_buffer(const char_buffer& other) + : size_{other.size_}, pos_{other.pos_} + { + if (other.bytes_) + { + bytes_ = make_unique(size_); + std::copy(other.bytes_.get(), other.bytes_.get() + pos_, + bytes_.get()); + } + } + + /// char_buffer can be move constructed + char_buffer(char_buffer&&) = default; + + /** + * @param rhs The buffer to assign into this one + * @return the current buffer + */ + char_buffer& operator=(const char_buffer& rhs) + { + char_buffer copy{rhs}; + swap(copy); + return *this; + } + + /// char_buffer can be move assigned + char_buffer& operator=(char_buffer&&) = default; + + /** + * Swaps the current buffer with the argument + * @param other The buffer to swap with + */ + void swap(char_buffer& other) + { + using std::swap; + swap(size_, other.size_); + swap(pos_, other.pos_); + swap(bytes_, other.bytes_); + } + + /** + * Writes a single byte to the buffer, resizing if needed. + * @param byte the byte to write + */ + void put(uint8_t byte) + { + if (size_ == pos_) + resize(); + bytes_[pos_] = byte; + ++pos_; + } + + /** + * Resizes the buffer to 1.5x its old size. 
+ */ + void resize() + { + if (size_ == 0) + { + size_ = 8; + } + else + { + // 1.5x resize + size_ += (size_ + 1) / 2; + } + + auto newbytes = make_unique(size_); + std::copy(bytes_.get(), bytes_.get() + pos_, newbytes.get()); + std::swap(newbytes, bytes_); + } + + /** + * Writes all the bytes in this buffer to the output stream + * @param os The output stream to write to. + */ + template + void write(OutputStream& os) const + { + os.write(reinterpret_cast(bytes_.get()), pos_); + } + + /// The bytes in this buffer + std::unique_ptr bytes_; + /// The current size of the buffer + std::size_t size_; + /// The current byte position in the buffer + std::size_t pos_; + + } buffer_; + + /// The primary key for the buffer + PrimaryKey pk_; + /// The last id we wrote + SecondaryKey last_id_ = SecondaryKey{0}; + /// The total number of ids we've written + uint64_t num_ids_ = 0; + /// The sum of the counts we've written + uint64_t total_counts_ = 0; +}; +} +} + +namespace std +{ +template +struct hash> +{ + using pbuffer_type = meta::index::postings_buffer; + std::size_t operator()(const pbuffer_type& pbuffer) const + { + return std::hash{}(pbuffer.primary_key()); + } +}; +} +#endif diff --git a/include/index/postings_data.h b/include/index/postings_data.h index 2b514c4b6..cfecd6891 100644 --- a/include/index/postings_data.h +++ b/include/index/postings_data.h @@ -92,10 +92,11 @@ class postings_data postings_data& operator=(postings_data&&) = default; /** - * @param other The other postings_data object to consume + * @param cont The other container (of SecondaryKey, count pairs) to merge * Adds the parameter's data to this object's data */ - void merge_with(postings_data& other); + template + void merge_with(Container&& cont); /** * @param s_id The SecondaryKey's id to add counts for diff --git a/include/index/postings_data.tcc b/include/index/postings_data.tcc index 526c09bac..80076cc56 100644 --- a/include/index/postings_data.tcc +++ b/include/index/postings_data.tcc @@ -21,7 +21,8 @@ postings_data::postings_data(PrimaryKey p_id) } template -void postings_data::merge_with(postings_data& other) +template +void postings_data::merge_with(Container&& cont) { auto searcher = [](const pair_t& p, const SecondaryKey& s) { @@ -32,7 +33,7 @@ void postings_data::merge_with(postings_data& other) // if the primary_key doesn't exist, add onto back uint64_t orig_length = counts_.size(); - for (auto& p : other.counts_) + for (auto& p : cont) { auto it = std::lower_bound( counts_.begin(), counts_.begin() + orig_length, p.first, searcher); diff --git a/include/index/postings_file.h b/include/index/postings_file.h index 284af2eac..3a0e068f3 100644 --- a/include/index/postings_file.h +++ b/include/index/postings_file.h @@ -54,7 +54,7 @@ class postings_file { if (pk < byte_locations_.size()) return postings_stream{ - postings_, byte_locations_.at(pk)}; + postings_.begin() + byte_locations_.at(pk)}; return util::nullopt; } diff --git a/include/index/postings_stream.h b/include/index/postings_stream.h index 9f3c68620..6a7189a49 100644 --- a/include/index/postings_stream.h +++ b/include/index/postings_stream.h @@ -12,9 +12,8 @@ #include #include +#include -#include "io/mmap_file.h" -#include "io/compressed_file_reader.h" #include "util/optional.h" #include "io/packed.h" @@ -49,19 +48,31 @@ class postings_stream public: /** - * Creates a postings stream reading from the given file at the given - * byte position. + * Creates a postings stream reading from the given buffer. 
Assumes + * that the size and total counts are the first two values in the + * buffer. * - * @param file The file that contains the postings lists - * @param seek_pos The position in the file to begin reading from + * @param buffer The buffer position to the start of the postings */ - postings_stream(const io::mmap_file& file, uint64_t seek_pos) - : file_{&file}, seek_pos_{seek_pos} + postings_stream(const char* buffer) : start_{buffer} { - char_input_stream stream{file_->begin() + seek_pos_}; + char_input_stream stream{start_}; io::packed::read(stream, size_); io::packed::read(stream, total_counts_); + start_ = stream.input_; + } + + /** + * Creates a postings stream reading from the given buffer. Assumes + * that the very first value in the buffer is the start of the + * postings, since the size and total counts are provided on + * construction. + */ + postings_stream(const char* buffer, uint64_t size, uint64_t total_counts) + : start_{buffer}, size_{size}, total_counts_{total_counts} + { + // nothing } /** @@ -81,6 +92,24 @@ class postings_stream return total_counts_; } + /** + * Writes this postings stream to an output stream in packed format. + * @return the number of bytes written + */ + template + uint64_t write_packed(OutputStream& os) const + { + auto bytes = io::packed::write(os, size_); + bytes += io::packed::write(os, total_counts_); + for (const auto& pr : *this) + { + bytes += io::packed::write(os, pr.first); + bytes + += io::packed::write(os, static_cast(pr.second)); + } + return bytes; + } + /** * An iterator over the (SecondaryKey, double) pairs of this postings * list. @@ -96,40 +125,37 @@ class postings_stream friend postings_stream; - iterator() : size_{0}, pos_{0} + iterator() : stream_{nullptr}, size_{0}, pos_{0} { // nothing } iterator& operator++() { - if (stor_) + if (pos_ == size_) { - if (pos_ == size_) + stream_ = {nullptr}; + size_ = 0; + pos_ = 0; + } + else + { + uint64_t id; + io::packed::read(stream_, id); + // gap encoding + count_.first += id; + + if (std::is_same::value) { - stor_ = util::nullopt; - pos_ = 0; - size_ = 0; + uint64_t next; + io::packed::read(stream_, next); + count_.second = static_cast(next); } else { - uint64_t id; - io::packed::read(*stream_, id); - // gap encoding - stor_->first += id; - - if (std::is_same::value) - { - uint64_t next; - io::packed::read(*stream_, next); - stor_->second = static_cast(next); - } - else - { - io::packed::read(*stream_, stor_->second); - } - ++pos_; + io::packed::read(stream_, count_.second); } + ++pos_; } return *this; } @@ -143,18 +169,18 @@ class postings_stream reference operator*() const { - return *stor_; + return count_; } pointer operator->() const { - return &(*stor_); + return &count_; } bool operator==(const iterator& other) { - return std::tie(stor_, size_, pos_) - == std::tie(other.stor_, other.size_, other.pos_); + return std::tie(stream_.input_, size_, pos_) + == std::tie(other.stream_.input_, other.size_, other.pos_); } bool operator!=(const iterator& other) @@ -163,23 +189,19 @@ class postings_stream } private: - iterator(const io::mmap_file& file, uint64_t seek_pos) - : stream_{file.begin() + seek_pos}, + iterator(const char* start, uint64_t size) + : stream_{start}, + size_{size}, pos_{0}, - stor_{std::make_pair(SecondaryKey{0}, 0.0)} + count_{std::make_pair(SecondaryKey{0}, 0.0)} { - io::packed::read(*stream_, size_); - - // ignore total counts - uint64_t total_counts; - io::packed::read(*stream_, total_counts); ++(*this); } - util::optional stream_; + char_input_stream stream_; 
uint64_t size_; uint64_t pos_; - util::optional> stor_; + std::pair count_; }; /** @@ -187,7 +209,7 @@ class postings_stream */ iterator begin() const { - return {*file_, seek_pos_}; + return {start_, size_}; } /** @@ -199,8 +221,7 @@ class postings_stream } private: - const io::mmap_file* file_; - uint64_t seek_pos_; + const char* start_; uint64_t size_; uint64_t total_counts_; }; diff --git a/include/util/probe_set.h b/include/util/probe_set.h new file mode 100644 index 000000000..b690cc09c --- /dev/null +++ b/include/util/probe_set.h @@ -0,0 +1,358 @@ +/** + * @file probe_set.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_UTIL_PROBE_SET_H_ +#define META_UTIL_PROBE_SET_H_ + +#include +#include +#include +#include +#include +#include + +namespace meta +{ +namespace util +{ + +namespace detail +{ +/** + * @param num The number to round to the next prime number + * @return a prime number greater than num + */ +inline uint64_t next_prime(uint64_t num) +{ + // list of primes for resizing. "borrowed" from boost::unordered. + static uint64_t primes[] + = {17ul, 29ul, 37ul, 53ul, 67ul, + 79ul, 97ul, 131ul, 193ul, 257ul, + 389ul, 521ul, 769ul, 1031ul, 1543ul, + 2053ul, 3079ul, 6151ul, 12289ul, 24593ul, + 49157ul, 98317ul, 196613ul, 393241ul, 786433ul, + 1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul, + 50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul, + 1610612741ul, 3221225473ul, 4294967291ul}; + + auto prime = std::upper_bound(std::begin(primes), std::end(primes), num); + if (prime == std::end(primes)) + --prime; + return *prime; +} +} + +/** + * An **insert-only** linear probing hash set. The keys are stored in + * contiguous memory, and the table itself maps to an index into the key + * storage. + * + * The primary use case for this is for storing in-memory chunks of + * postings data during indexing, but it could easily be used in other + * places. + */ +template , + class KeyEqual = std::equal_to> +class probe_set +{ + public: + /** + * An object to represent load factors for the table. It is represented + * as a fraction instead of a floating point number to avoid floating + * point operations when checking for the resize condition. + * + * Its default value is 3/4, meaning that the table will resize when + * more than 3/4 of the cells would become occupied after an insert. + */ + struct load_factor + { + uint64_t numerator = 3; + uint64_t denominator = 4; + }; + + /** + * Constructs an empty probe_set. + * @param alpha The desired load factor (optional) + */ + probe_set(load_factor alpha = {}) + : table_(17), occupancy_(17), alpha_(alpha) + { + // nothing + } + + /** + * An iterator over the set. Elements cannot be modified through this + * iterator. + */ + class const_iterator + { + public: + friend probe_set; + + /** + * Iterators can be default constructed. 
+ */ + const_iterator() = default; + + /** + * @return the current iterator + */ + const_iterator& operator++() + { + do + { + ++idx_; + } while (idx_ < parent_->occupancy_.size() + && !parent_->occupancy_[idx_]); + return *this; + } + + /** + * @return the key pointed to by this iterator + */ + const Key& operator*() const + { + return parent_->keys_[parent_->table_[idx_]]; + } + + /** + * @return a pointer to the key pointed to by this iterator + */ + const Key* operator->() const + { + return &parent_->keys_[parent_->table_[idx_]]; + } + + /** + * @param rhs The other iterator + * @return whether this iterator is equal to the other + */ + bool operator==(const const_iterator& rhs) + { + return parent_ == rhs.parent_ && idx_ == rhs.idx_; + } + + /** + * @param rhs The other iterator + * @return whether this iterator is not equal to the other + */ + bool operator!=(const const_iterator& rhs) + { + return !(*this == rhs); + } + + private: + /** + * The private constructor used by the probe_set to create its + * begin and end iterators. + * + * @param parent The probe set to iterate over + * @param idx The starting index into the table + */ + const_iterator(const probe_set* parent, std::size_t idx) + : parent_{parent}, idx_{idx} + { + if (idx < parent_->occupancy_.size() && !parent_->occupancy_[idx]) + ++(*this); + } + + /// The probe_set this iterator is iterating over + const probe_set* parent_; + + /// The current index into the table + std::size_t idx_; + }; + /// Regular iterators are the same as const_iterators. + using iterator = const_iterator; + + /** + * @return an iterator to the beginning of the set + */ + const_iterator begin() const + { + return {this, 0}; + } + + /** + * @return an iterator to the end of the set + */ + const_iterator end() const + { + return {this, occupancy_.size()}; + } + + /** + * @param key an rvalue reference to the key to be inserted into the + * table + */ + void emplace(Key&& key) + { + if (alpha_.denominator * (keys_.size() + 1) >= alpha_.numerator + * occupancy_.size()) + resize(); + + auto idx = hash_(key) % occupancy_.size(); + while (occupancy_[idx]) + idx = (idx + 1) % occupancy_.size(); + + occupancy_[idx] = true; + table_[idx] = keys_.size(); + + // hack for a 1.5x resizing vector on all implementations + if (keys_.size() == keys_.capacity()) + keys_.reserve(keys_.size() + (keys_.size() + 1) / 2); + keys_.emplace_back(std::move(key)); + } + + /** + * @param key The key to locate in the set + * @return an iterator to the key, if it exists, or to the end if it + * doesn't + */ + const_iterator find(const Key& key) const + { + auto idx = hash_(key) % occupancy_.size(); + while (occupancy_[idx] && !(equal_(keys_[table_[idx]], key))) + idx = (idx + 1) % occupancy_.size(); + + if (!occupancy_[idx]) + return end(); + + return {this, idx}; + } + + /** + * Empties the set. This releases the memory associated with the keys + * in the set, but keeps the memory associated with the table itself. 
+ */ + void clear() + { + // actually free all of the data, but keep around the actual table + // itself + std::vector{}.swap(keys_); + + // remember to mark everything as unoccupied + std::fill(occupancy_.begin(), occupancy_.end(), false); + } + + /** + * @param other The set to swap with + */ + void swap(probe_set& other) + { + using std::swap; + swap(table_, other.table_); + swap(occupancy_, other.occupancy_); + swap(keys_, other.keys_); + swap(hash_, other.hash_); + swap(equal_, other.equal_); + } + + /** + * @return whether the table is empty + */ + bool empty() const + { + return keys_.empty(); + } + + /** + * @return the current number of keys in the set + */ + std::size_t size() const + { + return keys_.size(); + } + + /** + * @return the current number of elements that can be stored in the + * table itself + */ + std::size_t capacity() const + { + return occupancy_.size(); + } + + /** + * @return an estimate for the number of heap allocated bytes used by + * the container + */ + std::size_t bytes_used() const + { + return sizeof(std::size_t) * table_.capacity() + occupancy_.capacity() + + sizeof(Key) * keys_.capacity() + sizeof(load_factor) + + sizeof(Hash) + sizeof(KeyEqual); + } + + /** + * @return the maximum allowed load factor for this table + */ + const load_factor& max_load_factor() const + { + return alpha_; + } + + /** + * This empties the hash table and returns the contiguous storage used + * to store the keys. + * @return all of the keys in the table + */ + std::vector extract_keys() + { + auto res = std::move(keys_); + clear(); + return res; + } + + private: + /** + * Increases the capacity of the table by resizing to twice the size + * rounded up to the closest prime number. + */ + void resize() + { + std::vector newocc(detail::next_prime(occupancy_.size() * 2)); + std::vector newtable(newocc.size()); + + for (std::size_t idx = 0; idx < occupancy_.size(); ++idx) + { + if (occupancy_[idx]) + { + auto& key = keys_[table_[idx]]; + auto nidx = hash_(key) % newocc.size(); + + while (newocc[nidx]) + nidx = (nidx + 1) % newocc.size(); + + newocc[nidx] = true; + newtable[nidx] = table_[idx]; + } + } + + using std::swap; + swap(newocc, occupancy_); + swap(newtable, table_); + } + + /// The table itself, which maps to indices into the keys storage + std::vector table_; + /// Whether a specific location in the table is occupied or not + std::vector occupancy_; + /// The contiguous storage used for holding the keys + std::vector keys_; + /// The maximum allowed load factor for the table + load_factor alpha_; + /// The hash function used for hashing the keys + Hash hash_; + /// The comparator used for testing keys for equality + KeyEqual equal_; +}; +} +} +#endif From b52587d70e3921c1f5c54e8de4e8572080959792 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 14 Aug 2015 21:37:25 -0500 Subject: [PATCH 192/481] Bump cpptoml to latest version. --- deps/cpptoml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/cpptoml b/deps/cpptoml index a8e0b0e4d..2dba80d3f 160000 --- a/deps/cpptoml +++ b/deps/cpptoml @@ -1 +1 @@ -Subproject commit a8e0b0e4d26ac8ffdff2ab2edb4eb9006cdf48f4 +Subproject commit 2dba80d3f1e87a3d4a5e01fb13494d2553e30df8 From d0685615d802880e84a7030a52c298f26f1f090f Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 14 Aug 2015 21:54:52 -0500 Subject: [PATCH 193/481] Fix make_unique shim for T[]. 
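
The single-object overload does `new T(std::forward<Args>(args)...)`,
which is ill-formed when T is an array type, so the C++11 shim
previously could not be used for array allocations at all. A quick
usage sketch of the two overloads after this fix (illustrative only,
not part of the diff below):

    #include "util/shim.h"
    #include <cstdint>

    int main()
    {
        // single object: arguments are forwarded to T's constructor
        auto num = meta::make_unique<uint64_t>(42);

        // array of unknown bound: allocates and value-initializes
        // (zeroes) the requested number of elements, as the byte
        // buffers in postings_buffer do via make_unique<char[]>(size)
        auto bytes = meta::make_unique<char[]>(1024);

        return (*num == 42 && bytes[0] == 0) ? 0 : 1;
    }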
---
 include/util/shim.h | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/include/util/shim.h b/include/util/shim.h
index c438995ac..cf697a4bc 100644
--- a/include/util/shim.h
+++ b/include/util/shim.h
@@ -18,16 +18,38 @@ namespace meta
 {
 
 #ifndef META_HAS_STD_MAKE_UNIQUE
+
+namespace detail
+{
+template <class T>
+struct unique_if
+{
+    using single_object = std::unique_ptr<T>;
+};
+
+template <class T>
+struct unique_if<T[]>
+{
+    using unknown_bound = std::unique_ptr<T[]>;
+};
+}
+
 /**
  * Constructs a unique ptr in place.
  * @param args The parameters to the constructor
  * @return a unique_ptr
  */
 template <class T, class... Args>
-std::unique_ptr<T> make_unique(Args&&... args)
+typename detail::unique_if<T>::single_object make_unique(Args&&... args)
 {
     return std::unique_ptr<T>{new T(std::forward<Args>(args)...)};
 }
+
+template <class T>
+typename detail::unique_if<T>::unknown_bound make_unique(std::size_t size)
+{
+    return std::unique_ptr<T>{new typename std::remove_extent<T>::type[size]()};
+}
 #else
 using std::make_unique;
 #endif

From ee404afb802ab9b657f8eb60bbe43685a7acaccf Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Fri, 14 Aug 2015 21:56:26 -0500
Subject: [PATCH 194/481] Fix assertion in postings_buffer.

---
 include/index/postings_buffer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/index/postings_buffer.h b/include/index/postings_buffer.h
index 15aed6a99..f6c4a79fd 100644
--- a/include/index/postings_buffer.h
+++ b/include/index/postings_buffer.h
@@ -112,7 +112,7 @@ class postings_buffer
         ++num_ids_;
         total_counts_ += static_cast<uint64_t>(count);
 
-        assert(id > last_id_);
+        assert(id >= last_id_);
         io::packed::write(buffer_, id - last_id_);
         io::packed::write(buffer_, count);
 

From e2bfdaf3201a77f8d7dbd85a3ea55f82dd8c5744 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Fri, 14 Aug 2015 22:02:35 -0500
Subject: [PATCH 195/481] Remove C++14 assumptions from util::string_view.
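The motivation: C++11 constexpr member functions are implicitly const and
their bodies are limited to (essentially) a single return statement; local
variables, loops, multiple returns, and member mutation only became legal
inside constexpr functions in C++14. The find/rfind/remove_prefix family
below needs all of those, so the constexpr qualifiers have to go when
targeting C++11. A sketch of the dividing line (illustrative, not code
from this patch):

    #include <cstddef>

    // fine as constexpr in both C++11 and C++14: one return expression
    constexpr std::size_t min_size(std::size_t a, std::size_t b)
    {
        return a < b ? a : b;
    }

    // constexpr only as of C++14: local variable, loop, and branch
    constexpr std::size_t count_char(const char* s, std::size_t n, char c)
    {
        std::size_t hits = 0;
        for (std::size_t i = 0; i < n; ++i)
            if (s[i] == c)
                ++hits;
        return hits;
    }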
--- include/util/string_view.h | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/include/util/string_view.h b/include/util/string_view.h index 208059d54..1e8fe980e 100644 --- a/include/util/string_view.h +++ b/include/util/string_view.h @@ -141,7 +141,7 @@ class basic_string_view return data_[pos]; } - constexpr const_reference at(size_type pos) const + const_reference at(size_type pos) const { if (pos >= size()) throw std::out_of_range{"index out of bounds"}; @@ -163,24 +163,24 @@ class basic_string_view return data_; } - constexpr void clear() noexcept + void clear() noexcept { data_ = nullptr; size_ = 0; } - constexpr void remove_prefix(size_type n) + void remove_prefix(size_type n) { data_ += n; size_ -= n; } - constexpr void remove_suffix(size_type n) + void remove_suffix(size_type n) { size_ -= n; } - constexpr void swap(basic_string_view& s) noexcept + void swap(basic_string_view& s) noexcept { using ::std::swap; swap(data_, s.data_); @@ -220,8 +220,7 @@ class basic_string_view constexpr int compare(basic_string_view s) const noexcept { - constexpr auto rlen = std::min(size(), s.size()); - return Traits::compare(data(), s.data(), rlen); + return Traits::compare(data(), s.data(), std::min(size(), s.size())); } constexpr int compare(size_type pos1, size_type n1, @@ -252,8 +251,7 @@ class basic_string_view return substr(pos1, n1).compare(basic_string_view{s, n2}); } - constexpr size_type find(basic_string_view s, size_type pos = 0) const - noexcept + size_type find(basic_string_view s, size_type pos = 0) const noexcept { if (pos >= size()) return npos; @@ -280,8 +278,7 @@ class basic_string_view return find(basic_string_view{s}, pos); } - constexpr size_type rfind(basic_string_view s, size_type pos = npos) const - noexcept + size_type rfind(basic_string_view s, size_type pos = npos) const noexcept { if (size() < s.size()) return npos; @@ -315,8 +312,8 @@ class basic_string_view return rfind(basic_string_view{s}, pos); } - constexpr size_type find_first_of(basic_string_view s, - size_type pos = 0) const noexcept + size_type find_first_of(basic_string_view s, size_type pos = 0) const + noexcept { if (pos >= size()) return npos; @@ -344,8 +341,8 @@ class basic_string_view return find_first_of(basic_string_view{s}, pos); } - constexpr size_type find_last_of(basic_string_view s, - size_type pos = npos) const noexcept + size_type find_last_of(basic_string_view s, size_type pos = npos) const + noexcept { if (pos >= size()) return npos; @@ -375,8 +372,8 @@ class basic_string_view return find_last_of(basic_string_view{s}, pos); } - constexpr size_type find_first_not_of(basic_string_view s, - size_type pos = 0) const noexcept + size_type find_first_not_of(basic_string_view s, size_type pos = 0) const + noexcept { if (pos >= size()) return npos; @@ -409,8 +406,8 @@ class basic_string_view return find_first_not_of(basic_string_view{s}, pos); } - constexpr size_type find_last_not_of(basic_string_view s, - size_type pos = npos) const noexcept + size_type find_last_not_of(basic_string_view s, size_type pos = npos) const + noexcept { if (pos >= size()) return npos; From 168df7d9915912d5ac733c4e63e75e34527edb9b Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 14 Aug 2015 22:13:08 -0500 Subject: [PATCH 196/481] Build in Release mode if no build type specified. 
---
 CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4eb2f7846..60c3f752b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,6 +13,11 @@ include(CheckCXXSourceRuns)
 include(ExternalProject)
 include(FindZLIB)
 
+if (NOT CMAKE_BUILD_TYPE)
+  message("-- No build type selected, defaulting to Release")
+  set(CMAKE_BUILD_TYPE "Release")
+endif()
+
 include_directories(include/)
 
 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/deps/findicu)

From 8f7943fe0361e41485a6f9282e9f0439b23f9807 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Fri, 14 Aug 2015 22:13:08 -0500
Subject: [PATCH 197/481] Increase rankers test time out for GCC.

(For some reason GCC really makes this stupidly slow in debug mode.)
---
 src/test/unit_tests.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/test/unit_tests.cmake b/src/test/unit_tests.cmake
index 418f0cf5f..e2f822027 100644
--- a/src/test/unit_tests.cmake
+++ b/src/test/unit_tests.cmake
@@ -35,7 +35,7 @@ set_tests_properties(classifiers PROPERTIES TIMEOUT 80 WORKING_DIRECTORY
     ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
 
 add_test(rankers ${UNIT_TEST_EXE} rankers)
-set_tests_properties(rankers PROPERTIES TIMEOUT 75 WORKING_DIRECTORY
+set_tests_properties(rankers PROPERTIES TIMEOUT 90 WORKING_DIRECTORY
     ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
 
 add_test(ir-eval ${UNIT_TEST_EXE} ir-eval)

From e84e3d852c16f2b88f6d72a2a9b79da50e1f0d9b Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sat, 15 Aug 2015 00:40:00 -0500
Subject: [PATCH 198/481] Fix string_view::compare() being totally broken.

---
 include/util/string_view.h | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/include/util/string_view.h b/include/util/string_view.h
index 1e8fe980e..6cb0629d4 100644
--- a/include/util/string_view.h
+++ b/include/util/string_view.h
@@ -218,9 +218,20 @@ class basic_string_view
             : basic_string_view{data() + pos, std::min(n, size() - pos)};
     }
 
-    constexpr int compare(basic_string_view s) const noexcept
+    int compare(basic_string_view s) const noexcept
     {
-        return Traits::compare(data(), s.data(), std::min(size(), s.size()));
+        auto cmp
+            = Traits::compare(data(), s.data(), std::min(size(), s.size()));
+        if (cmp != 0)
+            return cmp;
+
+        if (size() < s.size())
+            return -1;
+
+        if (size() == s.size())
+            return 0;
+
+        return 1;
     }
 
     constexpr int compare(size_type pos1, size_type n1,
@@ -603,10 +614,11 @@ namespace std
 {
 template <class Char, class Traits>
 struct hash<meta::util::basic_string_view<Char, Traits>>
 {
+    meta::util::murmur_hash<> hasher;
+
     size_t operator()(
         const meta::util::basic_string_view<Char, Traits>& view) const noexcept
     {
-        static meta::util::murmur_hash<> hasher{};
         return hasher(reinterpret_cast<const uint8_t*>(view.data()),
                       view.size());
     }

From 94c7926ee6645307663b9a0889377141f7c396ce Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sat, 15 Aug 2015 11:14:44 -0500
Subject: [PATCH 199/481] Add function for fetching metadata of a document.

Fixes issue #98.
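A hypothetical usage sketch (the index pointer and document id are
assumptions for illustration; "unique-terms" is a field the index itself
stores, and the field accessor returns an optional as used in the diff
below):

    corpus::metadata mdata = idx->metadata(doc_id{0});
    if (auto unique = mdata.get<uint64_t>("unique-terms"))
        std::cout << "unique terms: " << *unique << "\n";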
---
 include/index/disk_index.h | 7 +++++++
 src/index/disk_index.cpp   | 5 +++++
 2 files changed, 12 insertions(+)

diff --git a/include/index/disk_index.h b/include/index/disk_index.h
index 9e42d5572..812071a3a 100644
--- a/include/index/disk_index.h
+++ b/include/index/disk_index.h
@@ -13,6 +13,7 @@
 #include
 #include
 
+#include "corpus/metadata.h"
 #include "util/pimpl.h"
 #include "meta.h"
 
@@ -130,6 +131,12 @@ class disk_index
      */
     std::vector<class_label> class_labels() const;
 
+    /**
+     * @param d_id The document id to fetch metadata for
+     * @return the metadata associated with this document id
+     */
+    corpus::metadata metadata(doc_id d_id) const;
+
     /**
      * @param d_id
      * @return the number of unique terms in d_id

diff --git a/src/index/disk_index.cpp b/src/index/disk_index.cpp
index c63b2bc0e..d75620284 100644
--- a/src/index/disk_index.cpp
+++ b/src/index/disk_index.cpp
@@ -73,6 +73,11 @@ std::vector<class_label> disk_index::class_labels() const
     return impl_->class_labels();
 }
 
+corpus::metadata disk_index::metadata(doc_id d_id) const
+{
+    return impl_->metadata_->get(d_id);
+}
+
 uint64_t disk_index::unique_terms(doc_id d_id) const
 {
     return *impl_->metadata_->get(d_id).get<uint64_t>("unique-terms");

From b7d31eb775d40ab6a67e5d4b47f5516e409dd233 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sat, 15 Aug 2015 11:15:08 -0500
Subject: [PATCH 200/481] Add STDOPT to required compiler options for
 meta-definitions.

---
 CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 60c3f752b..b5dfae650 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -130,6 +130,10 @@ endif()
 add_library(meta-definitions INTERFACE)
 target_include_directories(meta-definitions INTERFACE
     ${PROJECT_SOURCE_DIR}/include)
 
+if (STDOPT)
+    target_compile_options(meta-definitions INTERFACE ${STDOPT})
+endif()
+
 if(ZLIB_FOUND)
     target_include_directories(meta-definitions INTERFACE
         ${ZLIB_INCLUDE_DIRS})

From 3396cae0bc564078f886406d1a02befb612b7d71 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sat, 15 Aug 2015 11:15:41 -0500
Subject: [PATCH 201/481] Minor optimizations for avoiding string compares.
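The guard added below is cheap because both comparisons are against
sentence-marker literals of known lengths 3 and 4: two integer size checks
reject almost every token before any character-level comparison or literal
length computation has to run. The shape of the test, pulled out as a
standalone sketch (the helper name is illustrative):

    #include <string>

    // true only for the sentence markers; the O(1) size window
    // short-circuits the string equality tests for everything else
    inline bool is_sentence_marker(const std::string& tok)
    {
        return tok.size() <= 4 && tok.size() >= 3
               && (tok == "<s>" || tok == "</s>");
    }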
---
 src/analyzers/filters/alpha_filter.cpp  | 8 ++++++--
 src/analyzers/filters/length_filter.cpp | 3 ++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/analyzers/filters/alpha_filter.cpp b/src/analyzers/filters/alpha_filter.cpp
index d0f33e3cf..bbeb70a3c 100644
--- a/src/analyzers/filters/alpha_filter.cpp
+++ b/src/analyzers/filters/alpha_filter.cpp
@@ -46,14 +46,18 @@ void alpha_filter::next_token()
     while (*source_)
     {
         auto tok = source_->next();
-        if (tok == "<s>" || tok == "</s>")
+        if (tok.size() <= 4 && tok.size() >= 3
+            && (tok == "<s>" || tok == "</s>"))
         {
             token_ = std::move(tok);
            return;
         }
 
         auto filt = utf::remove_if(tok, [](uint32_t codepoint)
-                                   { return !utf::isalpha(codepoint) && codepoint != '\''; });
+                                   {
+                                       return !utf::isalpha(codepoint)
+                                              && codepoint != '\'';
+                                   });
         if (!filt.empty())
         {
             token_ = std::move(filt);

diff --git a/src/analyzers/filters/length_filter.cpp b/src/analyzers/filters/length_filter.cpp
index 40944180f..fd6ace27e 100644
--- a/src/analyzers/filters/length_filter.cpp
+++ b/src/analyzers/filters/length_filter.cpp
@@ -65,7 +65,8 @@ void length_filter::next_token()
     while (*source_)
     {
         auto tok = source_->next();
-        if (tok == "<s>" || tok == "</s>")
+        if (tok.size() <= 4 && tok.size() >= 3
+            && (tok == "<s>" || tok == "</s>"))
         {
             token_ = std::move(tok);
             return;

From 4784b0f0e974c0f65d8b926caca839e54ef0e3d3 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sat, 15 Aug 2015 11:19:25 -0500
Subject: [PATCH 202/481] Cosmetic cleanups for some metadata functions.

---
 src/index/disk_index.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/index/disk_index.cpp b/src/index/disk_index.cpp
index d75620284..798d2ed68 100644
--- a/src/index/disk_index.cpp
+++ b/src/index/disk_index.cpp
@@ -80,7 +80,7 @@ corpus::metadata disk_index::metadata(doc_id d_id) const
 
 uint64_t disk_index::unique_terms(doc_id d_id) const
 {
-    return *impl_->metadata_->get(d_id).get<uint64_t>("unique-terms");
+    return *metadata(d_id).get<uint64_t>("unique-terms");
 }
 
 uint64_t disk_index::unique_terms() const
@@ -90,7 +90,7 @@ uint64_t disk_index::unique_terms() const
 
 uint64_t disk_index::doc_size(doc_id d_id) const
 {
-    return *impl_->metadata_->get(d_id).get<uint64_t>("length");
+    return *metadata(d_id).get<uint64_t>("length");
 }
 
 uint64_t disk_index::num_docs() const

From ccc16c850d72936cbff58ee7b217eb11ab2207de Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Mon, 17 Aug 2015 18:45:10 -0500
Subject: [PATCH 203/481] add missing include for certain compiler versions

---
 src/features/feature_selector.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/features/feature_selector.cpp b/src/features/feature_selector.cpp
index 438522957..0b46e2924 100644
--- a/src/features/feature_selector.cpp
+++ b/src/features/feature_selector.cpp
@@ -5,6 +5,7 @@
 
 #include
 #include
+#include
 #include "util/filesystem.h"
 #include "util/progress.h"
 #include "parallel/parallel_for.h"

From acb0df6509fe9746394448de3f7d9afa51f55cce Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Wed, 19 Aug 2015 13:26:25 -0500
Subject: [PATCH 204/481] usability updates to language_model and diff

---
 include/lm/diff.h         | 39 ++++++++++++++++-
 src/lm/diff.cpp           | 89 +++++++++++++++++++++------------------
 src/lm/language_model.cpp |  6 ++-
 src/lm/tools/lm-test.cpp  | 15 -------
 4 files changed, 90 insertions(+), 59 deletions(-)

diff --git a/include/lm/diff.h b/include/lm/diff.h
index f7b1f9b2b..f0205bab5 100644
--- a/include/lm/diff.h
+++ b/include/lm/diff.h
@@ -27,8 +27,33 @@ class diff
      */
     diff(const cpptoml::table& config);
 
+    /**
+     * Default move constructor.
+     */
     diff(diff&&) = default;
 
+    /**
+     * @param sent The sentence object to inspect
+     * @return the index of the least-likely ngram according to the
+     * language model
+     */
+    uint64_t least_likely_ngram(const sentence& sent) const;
+
+    /**
+     * @return the order of the LM used by this diff object
+     */
+    uint64_t n_val() const
+    {
+        return n_val_;
+    }
+
+    /**
+     * @return the language model used by this diff object
+     */
+    const language_model& lm() const
+    {
+        return lm_;
+    }
+
     /**
      * @param sent The sentence to transform
      * @param use_lm
@@ -102,6 +127,7 @@ class diff
 
     language_model lm_;
 
+    /// The order of the language model
     uint64_t n_val_;
 
     uint64_t max_edits_;
     double substitute_penalty_;
     double remove_penalty_;
 
+    /// Chooses whether to do edits at a low-probability location in the
+    /// sentence determined by a LM; if false, edits are performed at every
+    /// index.
     bool use_lm_;
+
     std::unordered_map<std::string, std::vector<std::string>> stems_;
     std::vector<std::string> fwords_;
     std::unordered_set<std::string> seen_;
 
+    /// How many candidate sentences to store.
     static constexpr uint64_t max_cand_size_ = 20;
-    /// balance between perplexity and edit weights; doesn't necessarily matter
-    /// since penalty weights will scale with different values of lambda
+
+    /// Balances perplexity and edit weights.
     static constexpr double lambda_ = 0.5;
+
+    /// Whether to insert likely words based on the language model. This is
+    /// currently turned off due to the LM representation making it
+    /// inefficient.
+    static constexpr bool lm_generate_ = false;
 };
 
 class diff_exception : public std::runtime_error

diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp
index 7bdebaa40..98061f7db 100644
--- a/src/lm/diff.cpp
+++ b/src/lm/diff.cpp
@@ -81,63 +81,72 @@ void diff::add(PQ& candidates, const sentence& sent)
         candidates.pop();
 }
 
-template <class PQ>
-void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth)
+uint64_t diff::least_likely_ngram(const sentence& sent) const
 {
-    if (sent.size() < n_val_)
-        return;
-
     double min_prob = 1;
     uint64_t best_idx = 0;
-    sentence best;
-    for (uint64_t i = n_val_ - 1; i < sent.size(); ++i)
+    for (uint64_t i = n_val_; i < sent.size(); ++i)
     {
-        auto ngram = sent(i - (n_val_ - 1), i + 1);
+        auto ngram = sent(i - n_val_, i);
         auto prob = lm_.log_prob(ngram);
         if (prob < min_prob)
        {
             min_prob = prob;
-            best_idx = i;
-            best = ngram;
+            best_idx = i - 1;
         }
     }
+    return best_idx;
+}
 
-    insert(sent, best_idx, candidates, depth);
-    remove(sent, best_idx, candidates, depth);
-    substitute(sent, best_idx, candidates, depth);
+template <class PQ>
+void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth)
+{
+    auto best_idx = least_likely_ngram(sent);
 
-    best.pop_back();
-    try
-    {
-        for (auto& next : lm_.top_k(best, 3))
+    for (uint64_t i = 0; i < n_val_ && i < best_idx; ++i)
+    {
+        insert(sent, best_idx - i, candidates, depth);
+        remove(sent, best_idx - i, candidates, depth);
+        substitute(sent, best_idx - i, candidates, depth);
+    }
+
+    if (lm_generate_)
+    {
+        auto best = sent(best_idx - n_val_, best_idx);
+        best.pop_back();
+        try
         {
-            if (next.first == "</s>")
-                continue;
-
-            sentence ins_cpy{sent};
-            ins_cpy.insert(best_idx, next.first,
-                           base_penalty_ + insert_penalty_);
+            for (auto& next : lm_.top_k(best, 3))
+            {
+                if (next.first == "</s>")
+                    continue;
+
+                sentence ins_cpy{sent};
+                ins_cpy.insert(best_idx, next.first,
+                               base_penalty_ + insert_penalty_);
+
+                if (seen_.find(ins_cpy.to_string()) == seen_.end())
+                {
+                    add(candidates, ins_cpy);
+                    step(ins_cpy, candidates, depth + 1);
+                }
 
-            if (seen_.find(ins_cpy.to_string()) == seen_.end())
-            {
-                add(candidates, ins_cpy);
-                step(ins_cpy, candidates, depth + 1);
-            }
+                sentence sub_cpy{sent};
+                sub_cpy.substitute(best_idx, next.first,
+                                   base_penalty_ + substitute_penalty_);
 
-            sentence sub_cpy{sent};
-            sub_cpy.substitute(best_idx, next.first,
-                               base_penalty_ + substitute_penalty_);
-
-            if (seen_.find(sub_cpy.to_string()) == seen_.end())
-            {
-                add(candidates, sub_cpy);
-                step(sub_cpy, candidates,
depth + 1); + if (next.first == "") + continue; + + sentence ins_cpy{sent}; + ins_cpy.insert(best_idx, next.first, + base_penalty_ + insert_penalty_); + + if (seen_.find(ins_cpy.to_string()) == seen_.end()) + { + add(candidates, ins_cpy); + step(ins_cpy, candidates, depth + 1); + } + + sentence sub_cpy{sent}; + sub_cpy.substitute(best_idx, next.first, + base_penalty_ + substitute_penalty_); + + if (seen_.find(sub_cpy.to_string()) == seen_.end()) + { + add(candidates, sub_cpy); + step(sub_cpy, candidates, depth + 1); + } } } - } - catch (language_model_exception& ex) - { - // ignore if there are no transitions found + catch (language_model_exception& ex) + { + // ignore if there are no transitions found + } } } diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index fb03b3e5c..7183475cf 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -28,7 +28,8 @@ language_model::language_model(const cpptoml::table& config) N_ = 0; if (binary_file && filesystem::file_exists(*binary_file + "0.binlm")) { - LOG(info) << "Loading language model from binary file..." << ENDLG; + LOG(info) << "Loading language model from binary files: " + << *binary_file << "*" << ENDLG; auto time = common::time( [&]() { @@ -42,7 +43,8 @@ language_model::language_model(const cpptoml::table& config) } else if (arpa_file && binary_file) { - LOG(info) << "Loading language model from .arpa file... " << ENDLG; + LOG(info) << "Loading language model from .arpa file: " << *arpa_file + << ENDLG; prefix_ = *binary_file; auto time = common::time([&]() { diff --git a/src/lm/tools/lm-test.cpp b/src/lm/tools/lm-test.cpp index 228d38c42..eb7614202 100644 --- a/src/lm/tools/lm-test.cpp +++ b/src/lm/tools/lm-test.cpp @@ -26,21 +26,6 @@ int main(int argc, char* argv[]) logging::set_cerr_logging(); - /* - lm::language_model model{cpptoml::parse_file(argv[1])}; - std::string line; - std::ifstream in{argv[2]}; - while (in) - { - std::getline(in, line); - if (line.empty()) - continue; - - lm::sentence sent{line}; - std::cout << model.log_prob(sent) << std::endl; - } - */ - lm::diff correcter{cpptoml::parse_file(argv[1])}; std::ifstream in{argv[2]}; auto num_sentences = filesystem::num_lines(argv[2]); From 91226131a670c6114cbc9d471432d286e0b42da0 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 21 Aug 2015 11:56:10 -0500 Subject: [PATCH 205/481] create tool to tokenize files based on POS tags --- src/sequence/crf/tools/CMakeLists.txt | 3 + src/sequence/crf/tools/pos_tokenizer.cpp | 85 ++++++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 src/sequence/crf/tools/pos_tokenizer.cpp diff --git a/src/sequence/crf/tools/CMakeLists.txt b/src/sequence/crf/tools/CMakeLists.txt index e17263bda..7163dcfb5 100644 --- a/src/sequence/crf/tools/CMakeLists.txt +++ b/src/sequence/crf/tools/CMakeLists.txt @@ -6,3 +6,6 @@ target_link_libraries(crf-test meta-crf meta-classify) add_executable(pos-tag pos_tag.cpp) target_link_libraries(pos-tag meta-tokenizers meta-crf) + +add_executable(pos-tokenizer pos_tokenizer.cpp) +target_link_libraries(pos-tokenizer meta-tokenizers meta-crf) diff --git a/src/sequence/crf/tools/pos_tokenizer.cpp b/src/sequence/crf/tools/pos_tokenizer.cpp new file mode 100644 index 000000000..e00da97ba --- /dev/null +++ b/src/sequence/crf/tools/pos_tokenizer.cpp @@ -0,0 +1,85 @@ +/** + * @file pos_tokenizer.cpp + * @author Sean Massung + */ + +#include +#include "sequence/crf/crf.h" +#include "sequence/crf/tagger.h" +#include "sequence/io/ptb_parser.h" +#include 
"sequence/sequence.h" +#include "sequence/crf/tagger.h" +#include "analyzers/tokenizers/icu_tokenizer.h" +#include "cpptoml.h" + +using namespace meta; + +int main(int argc, char* argv[]) +{ + if (argc != 2) + { + std::cerr << "Usage:\t" << argv[0] << " configFile" << std::endl; + return 1; + } + + logging::set_cerr_logging(); + + auto config = cpptoml::parse_file(argv[1]); + + auto keep_list_filename = config.get_as("function-words"); + std::unordered_set keep_list; + std::ifstream keep_list_file{*keep_list_filename}; + std::string word; + while (keep_list_file >> word) + keep_list.insert(word); + + auto crf_group = config.get_table("crf"); + if (!crf_group) + { + std::cerr << "[crf] group needed in config file" << std::endl; + return 1; + } + + auto prefix = crf_group->get_as("prefix"); + if (!prefix) + { + std::cerr << "prefix to learned model needed in [crf] group" + << std::endl; + return 1; + } + + sequence::crf crf{*prefix}; + auto ana = sequence::default_pos_analyzer(); + ana.load(*prefix); + const auto& analyzer = ana; + auto tagger = crf.make_tagger(); + + std::string line; + while (std::getline(std::cin, line)) + { + std::unique_ptr stream + = make_unique(); + stream->set_content(std::move(line)); + sequence::sequence seq; + while (*stream) + { + auto token = stream->next(); + if (token == " " || token == "" || token == "") + continue; + seq.add_observation( + {sequence::symbol_t{token}, sequence::tag_t{"[UNK]"}}); + } + + analyzer.analyze(seq); + tagger.tag(seq); + for (auto& obs : seq) + { + auto word = obs.symbol(); + if (keep_list.find(word) != keep_list.end()) + std::cout << word << " "; + else + std::cout << analyzer.tag(obs.label()) << " "; + } + std::cout << std::endl; + } +} From ccd974d7799a7b11913aa8bb2579b99853e55dcf Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 21 Aug 2015 12:36:45 -0500 Subject: [PATCH 206/481] create tool to print out the top k terms in a corpus --- src/tools/CMakeLists.txt | 3 ++ src/tools/top_k.cpp | 74 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 src/tools/top_k.cpp diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt index 800806d2b..cda480512 100644 --- a/src/tools/CMakeLists.txt +++ b/src/tools/CMakeLists.txt @@ -3,3 +3,6 @@ target_link_libraries(profile meta-index meta-greedy-tagger meta-parser ${CMAKE_THREAD_LIBS_INIT}) + +add_executable(top-k top_k.cpp) +target_link_libraries(top-k meta-index) diff --git a/src/tools/top_k.cpp b/src/tools/top_k.cpp new file mode 100644 index 000000000..fccc28065 --- /dev/null +++ b/src/tools/top_k.cpp @@ -0,0 +1,74 @@ +/** + * @file top_k.cpp + * @author Sean Massung + */ + +#include +#include +#include +#include +#include +#include +#include "cpptoml.h" +#include "corpus/corpus.h" +#include "analyzers/analyzer.h" +#include "analyzers/filters/all.h" + +using namespace meta; + +int main(int argc, char* argv[]) +{ + if (argc != 3) + { + std::cerr << "Usage: " << argv[0] << " config.toml k" << std::endl; + std::cerr << "Prints out the top k most frequent terms in the corpus " + "according to the filter chain specified in the config " + "file." 
+        return 1;
+    }
+
+    uint64_t k = std::stoi(argv[2]);
+
+    auto config = cpptoml::parse_file(argv[1]);
+    auto group = config.get_table_array("analyzers");
+    if (!group)
+        throw std::runtime_error{"[[analyzers]] missing from config"};
+
+    // only use the feature representation of the first analyzer
+    auto filts = analyzers::analyzer::load_filters(config, *(group->get()[0]));
+
+    std::unordered_map<std::string, uint64_t> counts;
+    auto docs = corpus::corpus::load(config);
+    while (docs->has_next())
+    {
+        auto doc = docs->next();
+        auto content = doc.content();
+        filts->set_content(std::move(content));
+        while (*filts)
+            ++counts[filts->next()];
+    }
+
+    using pair_t = std::pair<std::string, uint64_t>;
+    auto comp = [](const pair_t& a, const pair_t& b)
+    {
+        return a.second > b.second;
+    };
+    std::priority_queue<pair_t, std::vector<pair_t>, decltype(comp)> terms{
+        comp};
+    for (auto& term : counts)
+    {
+        terms.emplace(term);
+        if (terms.size() > k)
+            terms.pop();
+    }
+
+    std::vector<pair_t> sorted;
+    while (!terms.empty())
+    {
+        sorted.emplace_back(std::move(terms.top()));
+        terms.pop();
+    }
+
+    for (auto it = sorted.rbegin(); it != sorted.rend(); ++it)
+        std::cout << it->first << "\t" << it->second << std::endl;
+}

From 557b205f5c3da870bd99a8ed68b198a4a4b1c64b Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Fri, 21 Aug 2015 14:05:12 -0500
Subject: [PATCH 207/481] add progress printing for top-k

---
 src/tools/top_k.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/tools/top_k.cpp b/src/tools/top_k.cpp
index fccc28065..19f01a415 100644
--- a/src/tools/top_k.cpp
+++ b/src/tools/top_k.cpp
@@ -13,6 +13,8 @@
 #include "corpus/corpus.h"
 #include "analyzers/analyzer.h"
 #include "analyzers/filters/all.h"
+#include "util/progress.h"
+#include "logging/logger.h"
 
 using namespace meta;
 
@@ -29,6 +31,8 @@ int main(int argc, char* argv[])
 
     uint64_t k = std::stoi(argv[2]);
 
+    logging::set_cerr_logging();
+
     auto config = cpptoml::parse_file(argv[1]);
     auto group = config.get_table_array("analyzers");
     if (!group)
@@ -39,14 +43,17 @@ int main(int argc, char* argv[])
 
     std::unordered_map<std::string, uint64_t> counts;
     auto docs = corpus::corpus::load(config);
+    printing::progress prog{" > Reading corpus: ", docs->size()};
     while (docs->has_next())
     {
         auto doc = docs->next();
+        prog(doc.id());
         auto content = doc.content();
         filts->set_content(std::move(content));
         while (*filts)
             ++counts[filts->next()];
     }
+    prog.end();
 
     using pair_t = std::pair<std::string, uint64_t>;
     auto comp = [](const pair_t& a, const pair_t& b)

From 0c7670c989fa4d5948f810cd4b5a3d4103168114 Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Tue, 25 Aug 2015 09:48:39 -0500
Subject: [PATCH 208/481] fix bug in pos-tokenizer with empty sequence;
 lowercase kept tokens

---
 src/sequence/crf/tools/pos_tokenizer.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/sequence/crf/tools/pos_tokenizer.cpp b/src/sequence/crf/tools/pos_tokenizer.cpp
index e00da97ba..b613baebc 100644
--- a/src/sequence/crf/tools/pos_tokenizer.cpp
+++ b/src/sequence/crf/tools/pos_tokenizer.cpp
@@ -70,11 +70,15 @@ int main(int argc, char* argv[])
                 {sequence::symbol_t{token}, sequence::tag_t{"[UNK]"}});
         }
 
+        if (seq.size() == 0)
+            continue;
+
         analyzer.analyze(seq);
         tagger.tag(seq);
         for (auto& obs : seq)
         {
             auto word = obs.symbol();
+            std::transform(word.begin(), word.end(), word.begin(), ::tolower);
             if (keep_list.find(word) != keep_list.end())
                 std::cout << word << " ";
             else
                 std::cout << analyzer.tag(obs.label()) << " ";

From 9e7dfe4ffd4f4f60a308d5ff20e8b16b6330e41a Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Fri, 28 Aug 2015 16:30:55 -0500
Subject: [PATCH 209/481] Only force the build type to Release as root
 project.
If we are the root project, and the user failed to specify a build type,
force Release. Otherwise, it's not safe to muck with the build type (the
user's includes won't have the right #defines being set), so leave it
alone.
---
 CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b5dfae650..e983f064a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,7 +13,11 @@ include(CheckCXXSourceRuns)
 include(ExternalProject)
 include(FindZLIB)
 
-if (NOT CMAKE_BUILD_TYPE)
+# Check if there is no build type set. If meta itself is the root project,
+# compile it in release mode instead. If we aren't the root project, just
+# continue along with whatever we would do ordinarily (they *really* should
+# be specifying a build type, but...)
+if (NOT CMAKE_BUILD_TYPE AND CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
   message("-- No build type selected, defaulting to Release")
   set(CMAKE_BUILD_TYPE "Release")
 endif()

From b9f1653be08591604b5f56bbe79a60623dedf956 Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Mon, 31 Aug 2015 21:06:43 -0500
Subject: [PATCH 210/481] use utf::foldcase and ptb_normalizer in
 pos_tokenizer

---
 src/sequence/crf/tools/CMakeLists.txt    |  2 +-
 src/sequence/crf/tools/pos_tokenizer.cpp | 22 ++++++++++++++++------
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/sequence/crf/tools/CMakeLists.txt b/src/sequence/crf/tools/CMakeLists.txt
index 7163dcfb5..0e57b1bd7 100644
--- a/src/sequence/crf/tools/CMakeLists.txt
+++ b/src/sequence/crf/tools/CMakeLists.txt
@@ -8,4 +8,4 @@ add_executable(pos-tag pos_tag.cpp)
 target_link_libraries(pos-tag meta-tokenizers meta-crf)
 
 add_executable(pos-tokenizer pos_tokenizer.cpp)
-target_link_libraries(pos-tokenizer meta-tokenizers meta-crf)
+target_link_libraries(pos-tokenizer meta-analyzers meta-crf)

diff --git a/src/sequence/crf/tools/pos_tokenizer.cpp b/src/sequence/crf/tools/pos_tokenizer.cpp
index b613baebc..d25bba3a5 100644
--- a/src/sequence/crf/tools/pos_tokenizer.cpp
+++ b/src/sequence/crf/tools/pos_tokenizer.cpp
@@ -4,13 +4,14 @@
  */
 
 #include
+#include "analyzers/filters/ptb_normalizer.h"
+#include "analyzers/tokenizers/icu_tokenizer.h"
+#include "cpptoml.h"
 #include "sequence/crf/crf.h"
 #include "sequence/crf/tagger.h"
 #include "sequence/io/ptb_parser.h"
 #include "sequence/sequence.h"
-#include "sequence/crf/tagger.h"
-#include "analyzers/tokenizers/icu_tokenizer.h"
-#include "cpptoml.h"
+#include "utf/utf.h"
 
 using namespace meta;
 
@@ -18,7 +19,7 @@ int main(int argc, char* argv[])
 {
     if (argc != 2)
     {
-        std::cerr << "Usage:\t" << argv[0] << " configFile" << std::endl;
+        std::cerr << "Usage:\t" << argv[0] << " config.toml" << std::endl;
         return 1;
     }
 
@@ -59,6 +60,8 @@ int main(int argc, char* argv[])
     {
         std::unique_ptr<analyzers::token_stream> stream
             = make_unique<analyzers::tokenizers::icu_tokenizer>();
+        stream = make_unique<analyzers::filters::ptb_normalizer>(
+            std::move(stream));
         stream->set_content(std::move(line));
         sequence::sequence seq;
         while (*stream)
@@ -73,14 +76,21 @@ int main(int argc, char* argv[])
         if (seq.size() == 0)
             continue;
 
+        std::unordered_set<std::string> ptb_special
+            = {"-LRB-", "-RRB-", "-LSB-", "-RSB-", "-LCB-", "-RCB-"};
+
         analyzer.analyze(seq);
         tagger.tag(seq);
         for (auto& obs : seq)
         {
             auto word = obs.symbol();
-            std::transform(word.begin(), word.end(), word.begin(), ::tolower);
-            if (keep_list.find(word) != keep_list.end())
+            if (ptb_special.find(word) != ptb_special.end())
+                std::cout << word << " ";
+            else if (keep_list.find(word) != keep_list.end())
+            {
+                word = utf::foldcase(word);
                 std::cout << word << " ";
+            }
             else
                 std::cout <<
analyzer.tag(obs.label()) << " "; } From 9e3ddb775b94a67228e47a660b58d2d56e8a1481 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 31 Aug 2015 21:10:09 -0500 Subject: [PATCH 211/481] rename lm-test.cpp to diff_test.cpp (since that's what it's doing) --- src/lm/tools/CMakeLists.txt | 4 ++-- src/lm/tools/{lm-test.cpp => diff_test.cpp} | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) rename src/lm/tools/{lm-test.cpp => diff_test.cpp} (97%) diff --git a/src/lm/tools/CMakeLists.txt b/src/lm/tools/CMakeLists.txt index 81c5d018c..629a57efd 100644 --- a/src/lm/tools/CMakeLists.txt +++ b/src/lm/tools/CMakeLists.txt @@ -1,5 +1,5 @@ -add_executable(lm-test lm-test.cpp) -target_link_libraries(lm-test meta-language-model meta-index) +add_executable(diff-test diff_test.cpp) +target_link_libraries(diff-test meta-language-model meta-index) add_executable(create-dataset create-dataset.cpp) target_link_libraries(create-dataset meta-language-model meta-index) diff --git a/src/lm/tools/lm-test.cpp b/src/lm/tools/diff_test.cpp similarity index 97% rename from src/lm/tools/lm-test.cpp rename to src/lm/tools/diff_test.cpp index eb7614202..550955ef8 100644 --- a/src/lm/tools/lm-test.cpp +++ b/src/lm/tools/diff_test.cpp @@ -1,5 +1,5 @@ /** - * @file lm-test.cpp + * @file diff_test.cpp * @author Sean Massung */ @@ -8,10 +8,9 @@ #include "meta.h" #include "lm/diff.h" #include "lm/sentence.h" -#include "lm/language_model.h" #include "logging/logger.h" -#include "util/progress.h" #include "util/filesystem.h" +#include "util/progress.h" using namespace meta; From 13ad594df746f20b7cc2107ab44b40c0d3215164 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 31 Aug 2015 21:11:42 -0500 Subject: [PATCH 212/481] remove commented-out edit strings from sentence.cpp --- src/lm/sentence.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp index 886f43260..9496f77a0 100644 --- a/src/lm/sentence.cpp +++ b/src/lm/sentence.cpp @@ -81,8 +81,6 @@ sentence sentence::operator()(size_type from, size_type to) const void sentence::substitute(size_type idx, const std::string& token, double weight /* = 0.0 */) { - // ops_.push_back("substitute(" + std::to_string(idx) + ", " + tokens_[idx] - // + " -> " + token + ")"); ops_.push_back("substitute(" + tokens_[idx] + " -> " + token + ")"); tokens_[idx] = token; weights_.push_back(weight); @@ -90,8 +88,6 @@ void sentence::substitute(size_type idx, const std::string& token, void sentence::remove(size_type idx, double weight /* = 0.0 */) { - // ops_.push_back("remove(" + std::to_string(idx) + ", " + (*this)[idx] + - // ")"); ops_.push_back("remove(" + (*this)[idx] + ")"); tokens_.erase(tokens_.begin() + idx); weights_.push_back(weight); @@ -101,7 +97,6 @@ void sentence::insert(size_type idx, const std::string& token, double weight /* = 0.0 */) { tokens_.insert(tokens_.begin() + idx, token); - // ops_.push_back("insert(" + std::to_string(idx) + ", " + token + ")"); ops_.push_back("insert(" + token + ")"); weights_.push_back(weight); } From 5228dca5b91aa2e4e47275e7d497ddafa1ffe5b7 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 31 Aug 2015 21:12:47 -0500 Subject: [PATCH 213/481] remove tool that diff_analyzer replaces --- src/lm/tools/CMakeLists.txt | 3 -- src/lm/tools/create-dataset.cpp | 61 --------------------------------- 2 files changed, 64 deletions(-) delete mode 100644 src/lm/tools/create-dataset.cpp diff --git a/src/lm/tools/CMakeLists.txt b/src/lm/tools/CMakeLists.txt index 629a57efd..ed492353f 100644 --- 
a/src/lm/tools/CMakeLists.txt
+++ b/src/lm/tools/CMakeLists.txt
@@ -1,5 +1,2 @@
 add_executable(diff-test diff_test.cpp)
 target_link_libraries(diff-test meta-language-model meta-index)
-
-add_executable(create-dataset create-dataset.cpp)
-target_link_libraries(create-dataset meta-language-model meta-index)

diff --git a/src/lm/tools/create-dataset.cpp b/src/lm/tools/create-dataset.cpp
deleted file mode 100644
index 35de29f99..000000000
--- a/src/lm/tools/create-dataset.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/**
- * @file create-dataset.cpp
- * @author Sean Massung
- */
-
-#include
-#include
-#include "meta.h"
-#include "cpptoml.h"
-#include "lm/diff.h"
-#include "lm/sentence.h"
-
-using namespace meta;
-
-int main(int argc, char* argv[])
-{
-    if (argc != 3)
-    {
-        std::cerr << "Usage:\t" << argv[0] << " config.toml input.txt"
-                  << std::endl;
-        return 1;
-    }
-
-    bool diagnostic = true;
-    auto config = cpptoml::parse_file(argv[1]);
-    lm::diff correcter{*config.get_table("diff")};
-    std::string line;
-    std::ifstream in{argv[2]};
-    std::ofstream out{"edits.dat"};
-    while (in)
-    {
-        std::getline(in, line);
-        if (line.empty())
-            continue;
-        try
-        {
-            if (diagnostic)
-            {
-                out << std::endl;
-                out << line << std::endl;
-            }
-            lm::sentence sent{line};
-            auto candidates = correcter.candidates(sent, true);
-            auto edits = candidates[0].first.operations();
-            if (edits.empty())
-                out << "unmodified" << std::endl;
-            else
-            {
-                for (auto& e : edits)
-                    out << e << " ";
-                out << std::endl;
-            }
-            if (diagnostic)
-                out << candidates[0].first.to_string() << std::endl;
-        }
-        catch (lm::sentence_exception& ex)
-        {
-            out << "error" << std::endl;
-        }
-    }
-}

From 1884cb1ec0b1c133bdef96b37c978f53ba2503c2 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Thu, 3 Sep 2015 15:46:51 -0500
Subject: [PATCH 214/481] Switch porter2_stemmer origin back to
 smassung/porter2_stemmer master.

---
 .gitmodules          | 4 ++--
 deps/porter2_stemmer | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 2ec784771..bf53e8358 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,8 +4,8 @@
 	branch = master
 [submodule "deps/porter2_stemmer"]
 	path = deps/porter2_stemmer
-	url = https://github.com/meta-toolkit/porter2_stemmer.git
-	branch = meta
+	url = https://github.com/smassung/porter2_stemmer.git
+	branch = master
 [submodule "deps/libsvm-modules"]
 	path = deps/libsvm-modules
 	url = https://github.com/meta-toolkit/meta-libsvm.git

diff --git a/deps/porter2_stemmer b/deps/porter2_stemmer
index d6acc0678..ca29419b2 160000
--- a/deps/porter2_stemmer
+++ b/deps/porter2_stemmer
@@ -1 +1 @@
-Subproject commit d6acc06781f1db171c0264bb4ef619a41f06c077
+Subproject commit ca29419b2810f39391a8260bd3f1ff862ace9764

From 91dbc19c7a586a19069fb156d47b4743057928a6 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Fri, 4 Sep 2015 00:41:45 -0500
Subject: [PATCH 215/481] Allow for creating a forward_index directly.

This will let us have classification-specific analyzers that produce
floating-point weighted features, which wouldn't be natively supported
via uninverting an inverted index.

This change also makes it (slightly) more clear what the postings format
supports for each index type (postings_data_type for inverted_index has
uint64_t counts, whereas postings_data_type for forward_index has double
counts).

This changes the postings format ever so slightly, so you will need to
regenerate any forward indexes you have.
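The centerpiece of the new chunk_reader.h below is multiway_merge: each
chunk is already sorted by primary key, so the merger repeatedly locates
the smallest key among the chunk heads, combines the postings of every
chunk positioned at that key, writes one merged record, and advances those
chunks. The same strategy on plain integers, as a self-contained sketch
(illustrative only; the real code streams postings_data objects from
files):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
        // three "chunks", each sorted by key
        std::vector<std::vector<int>> chunks
            = {{1, 3, 5}, {1, 2, 5, 8}, {2, 3}};
        std::vector<std::size_t> pos(chunks.size(), 0);

        while (true)
        {
            // find the smallest key among the non-exhausted chunk heads
            bool found = false;
            int min_key = 0;
            for (std::size_t i = 0; i < chunks.size(); ++i)
            {
                if (pos[i] < chunks[i].size()
                    && (!found || chunks[i][pos[i]] < min_key))
                {
                    min_key = chunks[i][pos[i]];
                    found = true;
                }
            }
            if (!found)
                break;

            // "merge" every chunk whose head matches, then advance them
            int sources = 0;
            for (std::size_t i = 0; i < chunks.size(); ++i)
            {
                if (pos[i] < chunks[i].size()
                    && chunks[i][pos[i]] == min_key)
                {
                    ++sources;
                    ++pos[i];
                }
            }
            std::cout << "key " << min_key << " merged from " << sources
                      << " chunk(s)\n";
        }
    }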
--- include/index/chunk_handler.tcc | 330 ------------------ include/index/chunk_reader.h | 243 +++++++++++++ include/index/forward_index.h | 8 +- include/index/inverted_index.h | 6 +- include/index/postings_buffer.h | 8 +- include/index/postings_data.h | 43 ++- include/index/postings_data.tcc | 129 ++++--- include/index/postings_file.h | 9 +- include/index/postings_file_writer.h | 5 +- .../{chunk_handler.h => postings_inverter.h} | 20 +- include/index/postings_inverter.tcc | 200 +++++++++++ include/index/postings_stream.h | 7 +- include/util/probe_set.h | 23 +- src/index/forward_index.cpp | 288 +++++++++++++-- src/index/inverted_index.cpp | 32 +- src/test/forward_index_test.cpp | 19 +- 16 files changed, 873 insertions(+), 497 deletions(-) delete mode 100644 include/index/chunk_handler.tcc create mode 100644 include/index/chunk_reader.h rename include/index/{chunk_handler.h => postings_inverter.h} (88%) create mode 100644 include/index/postings_inverter.tcc diff --git a/include/index/chunk_handler.tcc b/include/index/chunk_handler.tcc deleted file mode 100644 index dc08ce1e2..000000000 --- a/include/index/chunk_handler.tcc +++ /dev/null @@ -1,330 +0,0 @@ -/** - * @file chunk_handler.tcc - * @author Chase Geigle - */ - -#include -#include - -#include "index/chunk_handler.h" -#include "index/disk_index.h" -#include "parallel/thread_pool.h" - -namespace meta -{ -namespace index -{ - -template -chunk_handler::producer::producer(chunk_handler* parent, - uint64_t ram_budget) - : max_size_{ram_budget}, parent_{parent} -{ - chunk_size_ = pdata_.bytes_used(); - assert(chunk_size_ < max_size_); -} - -template -template -void chunk_handler::producer::operator()(const secondary_key_type& key, - const Container& counts) -{ - for (const auto& count : counts) - { - postings_buffer_type pb{count.first}; - auto it = pdata_.find(pb); - if (it == pdata_.end()) - { - // check if we would resize on an insert - const auto& max_load_factor = pdata_.max_load_factor(); - if (max_load_factor.denominator * (pdata_.size() + 1) - >= max_load_factor.numerator * pdata_.capacity()) - { - // now check if roughly doubling our bytes used is going to - // cause problems - auto next_chunk_size = chunk_size_ + pdata_.bytes_used() - + pdata_.bytes_used() / 2; - if (next_chunk_size >= max_size_) - { - // if so, flush the current chunk before carrying on - flush_chunk(); - } - } - - chunk_size_ -= pdata_.bytes_used(); - - pb.write_count(key, static_cast(count.second)); - chunk_size_ += pb.bytes_used(); - pdata_.emplace(std::move(pb)); - - chunk_size_ += pdata_.bytes_used(); - } - else - { - chunk_size_ -= it->bytes_used(); - - // note: we can modify elements in this set because we do not change - // how comparisons are made (the primary_key value) - const_cast(*it) - .write_count(key, static_cast(count.second)); - - chunk_size_ += it->bytes_used(); - } - - if (chunk_size_ >= max_size_) - flush_chunk(); - } -} - -template -void chunk_handler::producer::flush_chunk() -{ - if (pdata_.empty()) - return; - - // extract the keys, emptying the hash set - auto pdata = pdata_.extract_keys(); - std::sort(pdata.begin(), pdata.end()); - parent_->write_chunk(pdata); - - chunk_size_ = pdata_.bytes_used(); - - // if the table itself is beyond the maximum chunk size, start over - // (this should rarely, if ever, happen) - if (chunk_size_ > max_size_) - { - decltype(pdata_){}.swap(pdata_); - chunk_size_ = pdata_.bytes_used(); - } -} - -template -chunk_handler::producer::~producer() -{ - flush_chunk(); -} - -template 
-chunk_handler::chunk_handler(const std::string& prefix) - : prefix_{prefix} -{ - // nothing -} - -template -auto chunk_handler::make_producer(uint64_t ram_budget) -> producer -{ - return {this, ram_budget}; -} - -template -void chunk_handler::write_chunk(std::vector& pdata) -{ - auto chunk_num = chunk_num_.fetch_add(1); - - util::optional top; - { - std::lock_guard lock{mutables_}; - if (!chunks_.empty()) - { - top = chunks_.top(); - chunks_.pop(); - } - } - - if (!top) // pqueue was empty - { - std::string chunk_name = prefix_ + "/chunk-" - + std::to_string(chunk_num); - { - std::ofstream outfile{chunk_name, std::ios::binary}; - for (auto& p : pdata) - p.write_packed(outfile); - } - pdata.clear(); - - std::lock_guard lock{mutables_}; - chunks_.emplace(chunk_name); - } - else // we can merge with an existing chunk - { - top->memory_merge_with(pdata); - - std::lock_guard lock{mutables_}; - chunks_.emplace(*top); - } -} - -namespace detail -{ -/** - * Represents an on-disk chunk to be merged with multi-way merge sort. Each - * input_chunk stores the file it's reading from, the total bytes needed to - * be read, and the current number of bytes read, as well as buffers in one - * postings. - */ -template -struct input_chunk -{ - std::unique_ptr file; - std::string path; - typename Index::index_pdata_type postings; - uint64_t total_bytes; - uint64_t bytes_read; - - input_chunk(const std::string& filename) - : file{make_unique(filename, std::ios::binary)}, - path{filename}, - total_bytes{filesystem::file_size(path)}, - bytes_read{0} - { - ++(*this); - } - - ~input_chunk() - { - if (file) - { - file = nullptr; - filesystem::delete_file(path); - } - } - - input_chunk(input_chunk&&) = default; - - input_chunk& operator=(input_chunk&& rhs) - { - if (file) - { - file = nullptr; - filesystem::delete_file(path); - } - - file = std::move(rhs.file); - path = std::move(rhs.path); - postings = std::move(rhs.postings); - total_bytes = rhs.total_bytes; - bytes_read = rhs.bytes_read; - - return *this; - } - - operator bool() const - { - return static_cast(*file); - } - - bool operator<(const input_chunk& other) const - { - return postings < other.postings; - } - - void operator++() - { - bytes_read += postings.read_packed(*file); - } -}; -} - -template -void chunk_handler::merge_chunks() -{ - using input_chunk = detail::input_chunk; - std::vector to_merge; - to_merge.reserve(chunks_.size()); - while (!chunks_.empty()) - { - to_merge.emplace_back(chunks_.top().path()); - chunks_.pop(); - } - - printing::progress progress{ - " > Merging postings: ", - std::accumulate(to_merge.begin(), to_merge.end(), 0ul, - [](uint64_t acc, const input_chunk& chunk) - { - return acc + chunk.total_bytes; - })}; - std::ofstream outfile{prefix_ + "/postings.index", std::ios::binary}; - unique_primary_keys_ = 0; - - uint64_t total_read - = std::accumulate(to_merge.begin(), to_merge.end(), 0ul, - [](uint64_t acc, const input_chunk& chunk) - { - return acc + chunk.bytes_read; - }); - while (!to_merge.empty()) - { - progress(total_read); - ++(*unique_primary_keys_); - - std::sort(to_merge.begin(), to_merge.end()); - - // gather all postings that match the smallest primary key, reading - // a new postings from the corresponding file - auto range = std::equal_range(to_merge.begin(), to_merge.end(), - *to_merge.begin()); - auto min_pk = range.first->postings.primary_key(); - - using count_t = typename index_pdata_type::count_t; - std::vector to_write; - to_write.reserve(std::distance(range.first, range.second)); - 
std::for_each(range.first, range.second, [&](input_chunk& chunk) - { - to_write.emplace_back(chunk.postings.counts()); - auto before = chunk.bytes_read; - ++chunk; - total_read += (chunk.bytes_read - before); - }); - - // merge them all into one big counts vector - count_t counts; - std::for_each(to_write.begin(), to_write.end(), [&](count_t& pd) - { - std::move(pd.begin(), pd.end(), - std::back_inserter(counts)); - count_t{}.swap(pd); - }); - - // write out the merged counts - index_pdata_type output{std::move(min_pk)}; - output.set_counts(counts); - count_t{}.swap(counts); - output.write_packed(outfile); - - // remove all empty chunks from the input - to_merge.erase(std::remove_if(to_merge.begin(), to_merge.end(), - [](const input_chunk& chunk) - { - return !chunk; - }), - to_merge.end()); - } -} - -template -uint64_t chunk_handler::unique_primary_keys() const -{ - if (!unique_primary_keys_) - throw chunk_handler_exception{ - "merge has not been called before requesting unique primary keys"}; - return *unique_primary_keys_; -} - -template -uint64_t chunk_handler::final_size() const -{ - if (!chunks_.empty()) - throw chunk_handler_exception{ - "merge not complete before final_size() called"}; - return filesystem::file_size(prefix_ + "/postings.index"); -} - -template -uint32_t chunk_handler::size() const -{ - return chunk_num_.load(); -} -} -} diff --git a/include/index/chunk_reader.h b/include/index/chunk_reader.h new file mode 100644 index 000000000..46abf5e0d --- /dev/null +++ b/include/index/chunk_reader.h @@ -0,0 +1,243 @@ +/** + * @file chunk_reader.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_INDEX_CHUNK_READER_H_ +#define META_INDEX_CHUNK_READER_H_ + +#include +#include +#include +#include +#include + +#include "util/filesystem.h" +#include "util/progress.h" +#include "util/shim.h" + +namespace meta +{ +namespace index +{ + +/** + * Represents an on-disk chunk to be merged with multi-way merge sort. Each + * chunk_reader stores the file it's reading from, the total bytes needed + * to be read, and the current number of bytes read, as well as buffers in + * one postings. + */ +template +class chunk_reader +{ + private: + /// the file we're reading from currently, or null if there is none + std::unique_ptr file_; + /// the path to the file we're reading from + std::string path_; + /// the current buffered postings data + PostingsData postings_; + /// the total number of bytes in the chunk we're reading + uint64_t total_bytes_; + /// the total number of bytes read + uint64_t bytes_read_; + + public: + /** + * Constructs a new chunk reader from the given chunk path. + * @param filename The path to the chunk to be read + */ + chunk_reader(const std::string& filename) + : file_{make_unique(filename, std::ios::binary)}, + path_{filename}, + total_bytes_{filesystem::file_size(path_)}, + bytes_read_{0} + { + ++(*this); + } + + /** + * Destroys the reader **and the chunk file it was reading from**. + */ + ~chunk_reader() + { + if (file_) + { + file_ = nullptr; + filesystem::delete_file(path_); + } + } + + /** + * chunk_reader can be move constructed. + */ + chunk_reader(chunk_reader&&) = default; + + /** + * chunk_reader can be move assigned. 
+ * @param rhs the right hand side of the assignment + */ + chunk_reader& operator=(chunk_reader&& rhs) + { + if (file_) + { + file_ = nullptr; + filesystem::delete_file(path_); + } + + file_ = std::move(rhs.file_); + path_ = std::move(rhs.path_); + postings_ = std::move(rhs.postings_); + total_bytes_ = rhs.total_bytes_; + bytes_read_ = rhs.bytes_read_; + + return *this; + } + + /** + * Whether or not the chunk_reader is in a good state. + * @return whether the underlying stream is in a good state + */ + operator bool() const + { + return static_cast(*file_); + } + + /** + * Comparison operator for sorting. + * @param other the other reader to compare with + * @return whether the current reader's postings's primary_key is less + * than other's + */ + bool operator<(const chunk_reader& other) const + { + return postings_ < other.postings_; + } + + /** + * Reads the next postings data from the stream. + */ + void operator++() + { + bytes_read_ += postings_.read_packed(*file_); + } + + /** + * @return the total number of bytes read so far + */ + uint64_t bytes_read() const + { + return bytes_read_; + } + + /** + * @return the total number of bytes in the chunk file + */ + uint64_t total_bytes() const + { + return total_bytes_; + } + + /** + * @return the current buffered postings object + */ + const PostingsData& postings() const + { + return postings_; + } +}; + +/** + * Performs a multi-way merge sort of all of the provided chunks, writing + * to the provided output stream. Currently, this function will attempt + * to open std::distance(begin, end) number of files and merge them all + * simultaneously but this could change in future implementations. + * + * @param outstream Where the merged chunks should be written + * @param begin An iterator to the beginning of the sequence containing + * the chunk paths + * @param end An iterator to the end of the sequence containing the chunk + * paths + * @return the total number of unique primary keys found during the merging + */ +template +uint64_t multiway_merge(std::ostream& outstream, ForwardIterator begin, + ForwardIterator end) +{ + using input_chunk = chunk_reader; + std::vector to_merge; + to_merge.reserve(std::distance(begin, end)); + for (; begin != end; ++begin) + to_merge.emplace_back(*begin); + + printing::progress progress{ + " > Merging postings: ", + std::accumulate(to_merge.begin(), to_merge.end(), 0ul, + [](uint64_t acc, const input_chunk& chunk) + { + return acc + chunk.total_bytes(); + })}; + + uint64_t unique_primary_keys = 0; + + uint64_t total_read + = std::accumulate(to_merge.begin(), to_merge.end(), 0ul, + [](uint64_t acc, const input_chunk& chunk) + { + return acc + chunk.bytes_read(); + }); + while (!to_merge.empty()) + { + progress(total_read); + ++unique_primary_keys; + + std::sort(to_merge.begin(), to_merge.end()); + + // gather all postings that match the smallest primary key, reading + // a new postings from the corresponding file + auto range = std::equal_range(to_merge.begin(), to_merge.end(), + *to_merge.begin()); + auto min_pk = range.first->postings().primary_key(); + + using count_t = typename PostingsData::count_t; + std::vector to_write; + to_write.reserve(std::distance(range.first, range.second)); + std::for_each(range.first, range.second, [&](input_chunk& chunk) + { + to_write.emplace_back(chunk.postings().counts()); + auto before = chunk.bytes_read(); + ++chunk; + total_read += (chunk.bytes_read() - before); + }); + + // merge them all into one big counts vector + count_t counts; + 
std::for_each(to_write.begin(), to_write.end(), [&](count_t& pd) + { + std::move(pd.begin(), pd.end(), + std::back_inserter(counts)); + count_t{}.swap(pd); + }); + + // write out the merged counts + PostingsData output{std::move(min_pk)}; + output.set_counts(std::move(counts)); + output.write_packed(outstream); + + // remove all empty chunks from the input + to_merge.erase(std::remove_if(to_merge.begin(), to_merge.end(), + [](const input_chunk& chunk) + { + return !chunk; + }), + to_merge.end()); + } + return unique_primary_keys; +} +} +} +#endif diff --git a/include/index/forward_index.h b/include/index/forward_index.h index 548fb3e0f..4d1b33340 100644 --- a/include/index/forward_index.h +++ b/include/index/forward_index.h @@ -28,7 +28,7 @@ class corpus; namespace index { -template +template class postings_data; } } @@ -73,9 +73,9 @@ class forward_index : public disk_index using primary_key_type = doc_id; using secondary_key_type = term_id; - using postings_data_type = postings_data; - using inverted_pdata_type = postings_data; - using index_pdata_type = postings_data_type; + using postings_data_type = postings_data; + using inverted_pdata_type = postings_data; + using index_pdata_type = postings_data; using exception = forward_index_exception; protected: diff --git a/include/index/inverted_index.h b/include/index/inverted_index.h index 6767b32e8..d8596fe0f 100644 --- a/include/index/inverted_index.h +++ b/include/index/inverted_index.h @@ -33,7 +33,7 @@ namespace index template class chunk_handler; -template +template class postings_data; } } @@ -66,8 +66,8 @@ class inverted_index : public disk_index using primary_key_type = term_id; using secondary_key_type = doc_id; - using postings_data_type = postings_data; - using index_pdata_type = postings_data; + using postings_data_type = postings_data; + using index_pdata_type = postings_data; using exception = inverted_index_exception; /** diff --git a/include/index/postings_buffer.h b/include/index/postings_buffer.h index f6c4a79fd..28e3c4461 100644 --- a/include/index/postings_buffer.h +++ b/include/index/postings_buffer.h @@ -59,7 +59,7 @@ uint64_t bytes_used( * allows us to store significantly larger in-memory chunks than if we were * to store the full materialized postings_data. */ -template +template class postings_buffer { private: @@ -106,11 +106,10 @@ class postings_buffer * @param id The SecondaryKey for the pair * @param count The count value associated with the id */ - template void write_count(SecondaryKey id, FeatureValue count) { ++num_ids_; - total_counts_ += static_cast(count); + total_counts_ += count; assert(id >= last_id_); io::packed::write(buffer_, id - last_id_); @@ -155,7 +154,6 @@ class postings_buffer /** * @return a postings_stream to iterate over the byte buffer */ - template postings_stream stream() const { return {reinterpret_cast(buffer_.bytes_.get()), num_ids_, @@ -292,7 +290,7 @@ class postings_buffer /// The total number of ids we've written uint64_t num_ids_ = 0; /// The sum of the counts we've written - uint64_t total_counts_ = 0; + FeatureValue total_counts_ = 0; }; } } diff --git a/include/index/postings_data.h b/include/index/postings_data.h index cfecd6891..94e436a56 100644 --- a/include/index/postings_data.h +++ b/include/index/postings_data.h @@ -33,13 +33,13 @@ namespace index * For example, for an inverted index, PrimaryKey = term_id, SecondaryKey = * doc_id. For a forward_index, PrimaryKey = doc_id, SecondaryKey = term_id. 
*/ -template +template class postings_data { public: using primary_key_type = PrimaryKey; using secondary_key_type = SecondaryKey; - using pair_t = std::pair; + using pair_t = std::pair; using count_t = std::vector; /** @@ -52,14 +52,6 @@ class postings_data && (util::is_numeric::value), "primary and secondary keys in postings data must be numeric types"); - /** - * uint64_t and double must take up the same number of bytes since they are - * being casted to each other when compressing. - */ - static_assert(sizeof(uint64_t) == sizeof(double), - "sizeof(uint64_t) must equal sizeof(double) since " - "reinterpret_cast is used in postings_data"); - /** * postings_data is default-constructable. */ @@ -103,14 +95,14 @@ class postings_data * @param amount The number of times to increase the count for a given * SecondaryKey */ - void increase_count(SecondaryKey s_id, double amount); + void increase_count(SecondaryKey s_id, FeatureValue amount); /** * @param s_id The SecondaryKey id to query * @return the number of times SecondaryKey occurred in this * postings_data */ - double count(SecondaryKey s_id) const; + FeatureValue count(SecondaryKey s_id) const; /** * @return the per-SecondaryKey frequency information for this @@ -123,6 +115,11 @@ class postings_data */ void set_counts(const count_t& counts); + /** + * @param counts A vector of counts to assign into this postings_data + */ + void set_counts(count_t&& counts); + /** * @param begin The beginning of the counts to assign into this * postings_data @@ -144,7 +141,6 @@ class postings_data * @param out The stream to write to * @return the number of bytes used to write out this postings data */ - template uint64_t write_packed(std::ostream& out) const; /** @@ -154,7 +150,6 @@ class postings_data * @return the number of bytes used to write out this postings data's * counts */ - template uint64_t write_packed_counts(std::ostream& out) const; /** @@ -163,7 +158,6 @@ class postings_data * @param in The stream to read from * @return the number of bytes read in consuming this postings data */ - template uint64_t read_packed(std::istream& in); /** @@ -191,7 +185,7 @@ class postings_data PrimaryKey p_id_; /// The (secondary_key_type, count) pairs - util::sparse_vector counts_; + util::sparse_vector counts_; }; /** @@ -200,21 +194,24 @@ class postings_data * @return whether this postings_data has the same PrimaryKey as * the paramter */ -template -bool operator==(const postings_data& lhs, - const postings_data& rhs); +template +bool operator==( + const postings_data& lhs, + const postings_data& rhs); } } namespace std { -template /** - * Hash specialization for postings_data + * Hash specialization for postings_data */ -struct hash> +template +struct hash> { - using pdata_t = meta::index::postings_data; + using pdata_t + = meta::index::postings_data; /** * @param pd The postings_data to hash * @return the hash of the given postings_data diff --git a/include/index/postings_data.tcc b/include/index/postings_data.tcc index 80076cc56..3a75ae8a5 100644 --- a/include/index/postings_data.tcc +++ b/include/index/postings_data.tcc @@ -14,15 +14,18 @@ namespace meta namespace index { -template -postings_data::postings_data(PrimaryKey p_id) +template +postings_data::postings_data( + PrimaryKey p_id) : p_id_{p_id} -{ /* nothing */ +{ + // nothing } -template +template template -void postings_data::merge_with(Container&& cont) +void postings_data::merge_with( + Container&& cont) { auto searcher = [](const pair_t& p, const SecondaryKey& s) { @@ -54,95 +57,106 @@ void 
postings_data::merge_with(Container&& cont) } } -template -void postings_data::increase_count(SecondaryKey s_id, - double amount) +template +void postings_data::increase_count( + SecondaryKey s_id, FeatureValue amount) { counts_[s_id] += amount; } -template -double postings_data::count(SecondaryKey s_id) const +template +FeatureValue postings_data::count( + SecondaryKey s_id) const { return counts_.at(s_id); } -template -const std::vector>& - postings_data::counts() const +template +auto postings_data::counts() const + -> const count_t & { return counts_.contents(); } -template -void postings_data::set_counts(const count_t& counts) +template +void postings_data::set_counts( + const count_t& counts) { // no sort needed: sparse_vector::contents() sorts the parameter counts_.contents(counts); } -template +template +void postings_data::set_counts( + count_t&& counts) +{ + // no sort needed: sparse_vector::contents() sorts the parameter + counts_.contents(std::move(counts)); +} + +template template -void postings_data::set_counts(InputIterator begin, - InputIterator end) +void postings_data::set_counts( + InputIterator begin, InputIterator end) { for (; begin != end; ++begin) counts_.emplace_back(*begin); counts_.shrink_to_fit(); } -template -void postings_data::set_primary_key( +template +void postings_data::set_primary_key( PrimaryKey new_key) { p_id_ = new_key; } -template -bool postings_data:: +template +bool postings_data:: operator<(const postings_data& other) const { return primary_key() < other.primary_key(); } -template -bool operator==(const postings_data& lhs, - const postings_data& rhs) +template +bool + operator==(const postings_data& lhs, + const postings_data& rhs) { return lhs.primary_key() == rhs.primary_key(); } -template -const PrimaryKey& postings_data::primary_key() const +template +const PrimaryKey& + postings_data::primary_key() const { return p_id_; } -template -template -uint64_t postings_data::write_packed( +template +uint64_t postings_data::write_packed( std::ostream& out) const { uint64_t bytes = 0; bytes += io::packed::write(out, p_id_); - bytes += write_packed_counts(out); + bytes += write_packed_counts(out); return bytes; } -template -template -uint64_t postings_data::write_packed_counts( - std::ostream& out) const +template +uint64_t + postings_data::write_packed_counts( + std::ostream& out) const { auto bytes = io::packed::write(out, counts_.size()); auto total_counts - = std::accumulate(counts_.begin(), counts_.end(), uint64_t{0}, - [](uint64_t cur, const pair_t& pr) + = std::accumulate(counts_.begin(), counts_.end(), FeatureValue{0}, + [](FeatureValue cur, const pair_t& pr) { - return cur + static_cast(pr.second); + return cur + pr.second; }); bytes += io::packed::write(out, total_counts); @@ -150,17 +164,7 @@ uint64_t postings_data::write_packed_counts( for (const auto& count : counts_) { bytes += io::packed::write(out, count.first - last_id); - - if (std::is_same::value) - { - bytes - += io::packed::write(out, static_cast(count.second)); - } - else - { - bytes += io::packed::write(out, count.second); - } - + bytes += io::packed::write(out, count.second); last_id = count.first; } @@ -186,9 +190,9 @@ uint64_t length(const T& elem, } } -template -template -uint64_t postings_data::read_packed(std::istream& in) +template +uint64_t postings_data::read_packed( + std::istream& in) { if (in.get() == EOF) return 0; @@ -198,7 +202,7 @@ uint64_t postings_data::read_packed(std::istream& in) auto bytes = io::packed::read(in, p_id_); uint64_t size; - uint64_t total_counts; 
+    FeatureValue total_counts;
     bytes += io::packed::read(in, size);
     bytes += io::packed::read(in, total_counts);
@@ -214,26 +218,17 @@ uint64_t postings_data::read_packed(std::istream& in)
         bytes += io::packed::read(in, gap);
         id += gap;
 
-        double count;
-        if (std::is_same::value)
-        {
-            uint64_t next;
-            bytes += io::packed::read(in, next);
-            count = static_cast(next);
-        }
-        else
-        {
-            bytes += io::packed::read(in, count);
-        }
-
+        FeatureValue count;
+        bytes += io::packed::read(in, count);
         counts_.emplace_back(id, count);
     }
 
     return bytes;
 }
 
-template 
-uint64_t postings_data::bytes_used() const
+template 
+uint64_t
+    postings_data::bytes_used() const
 {
     return sizeof(pair_t) * counts_.capacity() + length(p_id_) + sizeof(count_t);
diff --git a/include/index/postings_file.h b/include/index/postings_file.h
index 3a0e068f3..0fe8dffd6 100644
--- a/include/index/postings_file.h
+++ b/include/index/postings_file.h
@@ -26,11 +26,12 @@ namespace index
 * list is indexed via PrimaryKey and consists of pairs of (SecondaryKey,
 * double).
 */
-template 
+template 
 class postings_file
 {
   public:
-    using postings_data_type = postings_data;
+    using postings_data_type
+        = postings_data;
 
     /**
      * Opens a postings file.
@@ -48,7 +49,6 @@ class postings_file
      * @return a postings stream for this primary key, if it is in the
      * postings file
      */
-    template 
     util::optional> find_stream(PrimaryKey pk) const
     {
@@ -64,7 +64,6 @@ class postings_file
      * @return a shared pointer to the postings data extracted from the
      * file
      */
-    template 
     std::shared_ptr find(PrimaryKey pk) const
     {
         auto pdata = std::make_shared(pk);
@@ -73,7 +72,7 @@ class postings_file
         // if we are in-bounds of the postings file, populate counts
         if (idx < byte_locations_.size())
         {
-            auto stream = find_stream(pk);
+            auto stream = find_stream(pk);
             pdata->set_counts(stream->begin(), stream->end());
         }
diff --git a/include/index/postings_file_writer.h b/include/index/postings_file_writer.h
index 016a592ee..38c952a30 100644
--- a/include/index/postings_file_writer.h
+++ b/include/index/postings_file_writer.h
@@ -20,6 +20,7 @@ namespace meta
 namespace index
 {
 
+template 
 class postings_file_writer
 {
   public:
@@ -38,13 +39,13 @@ class postings_file_writer
 
     /**
      * Writes a postings data object to the file.
+     *
      * @param pdata The postings_data to be written
      */
-    template 
     void write(const PostingsData& pdata)
     {
         byte_locations_[id_] = byte_pos_;
-        byte_pos_ += pdata.template write_packed_counts(output_);
+        byte_pos_ += pdata.write_packed_counts(output_);
         ++id_;
     }
 
diff --git a/include/index/chunk_handler.h b/include/index/postings_inverter.h
similarity index 88%
rename from include/index/chunk_handler.h
rename to include/index/postings_inverter.h
index 4bb956d64..1855045ee 100644
--- a/include/index/chunk_handler.h
+++ b/include/index/postings_inverter.h
@@ -1,5 +1,5 @@
 /**
- * @file chunk_handler.h
+ * @file postings_inverter.h
  * @author Chase Geigle
  *
  * All files in META are dual-licensed under the MIT and NCSA licenses. For more
@@ -33,7 +33,7 @@ namespace index
 * disk_index.
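 *
 * A rough usage sketch (hypothetical prefix and budget; this mirrors how the
 * index builders drive the class):
 *
 * ~~~cpp
 * postings_inverter<inverted_index> inverter{"my-index"};
 * {
 *     auto producer = inverter.make_producer(1024 * 1024 * 1024);
 *     // for each tokenized document: producer(doc_id, counts);
 * } // the producer flushes its final chunk on destruction
 * inverter.merge_chunks();
 * ~~~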
 */
 template 
-class chunk_handler
+class postings_inverter
 {
   public:
     using index_pdata_type = typename Index::index_pdata_type;
@@ -55,7 +55,7 @@ class chunk_handler
          * @param ram_budget The **estimated** allowed size of the buffer
          * for this producer
          */
-        producer(chunk_handler* parent, uint64_t ram_budget);
+        producer(postings_inverter* parent, uint64_t ram_budget);
 
         /**
          * Handler for when a given secondary_key has been processed and is
@@ -92,17 +92,17 @@ class chunk_handler
         uint64_t max_size_;
 
         /// Back-pointer to the handler this producer is operating on
-        chunk_handler* parent_;
+        postings_inverter* parent_;
     };
 
     /**
-     * Constructs a chunk_handler that writes to the given prefix.
+     * Constructs a postings_inverter that writes to the given prefix.
     * @param prefix The prefix for all chunks to be written
     */
-    chunk_handler(const std::string& prefix);
+    postings_inverter(const std::string& prefix);
 
     /**
-     * Creates a producer for this chunk_handler. Producers are designed to
+     * Creates a producer for this postings_inverter. Producers are designed to
     * be thread-local buffers of chunks that write to disk when their
     * buffer is full.
     * @param ram_budget The estimated allowed size of this thread-local
@@ -133,9 +133,9 @@ class chunk_handler
     uint64_t unique_primary_keys() const;
 
     /**
-     * Simple exception class for chunk_handler interactions
+     * Simple exception class for postings_inverter interactions
     */
-    class chunk_handler_exception : public std::runtime_error
+    class postings_inverter_exception : public std::runtime_error
     {
         using std::runtime_error::runtime_error;
     };
@@ -165,5 +165,5 @@ class chunk_handler
 }
 }
 
-#include "index/chunk_handler.tcc"
+#include "index/postings_inverter.tcc"
 #endif
diff --git a/include/index/postings_inverter.tcc b/include/index/postings_inverter.tcc
new file mode 100644
index 000000000..c6dc2606d
--- /dev/null
+++ b/include/index/postings_inverter.tcc
@@ -0,0 +1,200 @@
+/**
+ * @file postings_inverter.tcc
+ * @author Chase Geigle
+ */
+
+#include 
+#include 
+
+#include "index/chunk_reader.h"
+#include "index/postings_inverter.h"
+#include "index/disk_index.h"
+#include "parallel/thread_pool.h"
+
+namespace meta
+{
+namespace index
+{
+
+template 
+postings_inverter::producer::producer(postings_inverter* parent,
+                                      uint64_t ram_budget)
+    : max_size_{ram_budget}, parent_{parent}
+{
+    chunk_size_ = pdata_.bytes_used();
+    assert(chunk_size_ < max_size_);
+}
+
+template 
+template 
+void postings_inverter::producer::
+    operator()(const secondary_key_type& key, const Container& counts)
+{
+    for (const auto& count : counts)
+    {
+        postings_buffer_type pb{count.first};
+        auto it = pdata_.find(pb);
+        if (it == pdata_.end())
+        {
+            // check if we would resize on an insert
+            const auto& max_load_factor = pdata_.max_load_factor();
+            if (max_load_factor.denominator * (pdata_.size() + 1)
+                >= max_load_factor.numerator * pdata_.capacity())
+            {
+                // now check if roughly doubling our bytes used is going to
+                // cause problems
+                auto next_chunk_size = chunk_size_ + pdata_.bytes_used()
+                                       + pdata_.bytes_used() / 2;
+                if (next_chunk_size >= max_size_)
+                {
+                    // if so, flush the current chunk before carrying on
+                    flush_chunk();
+                }
+            }
+
+            chunk_size_ -= pdata_.bytes_used();
+
+            pb.write_count(key, static_cast(count.second));
+            chunk_size_ += pb.bytes_used();
+            pdata_.emplace(std::move(pb));
+
+            chunk_size_ += pdata_.bytes_used();
+        }
+        else
+        {
+            chunk_size_ -= it->bytes_used();
+
+            // note: we can modify elements in this set because we do not change
+            // how comparisons are made (the 
primary_key value) + const_cast(*it) + .write_count(key, static_cast(count.second)); + + chunk_size_ += it->bytes_used(); + } + + if (chunk_size_ >= max_size_) + flush_chunk(); + } +} + +template +void postings_inverter::producer::flush_chunk() +{ + if (pdata_.empty()) + return; + + // extract the keys, emptying the hash set + auto pdata = pdata_.extract_keys(); + std::sort(pdata.begin(), pdata.end()); + parent_->write_chunk(pdata); + + chunk_size_ = pdata_.bytes_used(); + + // if the table itself is beyond the maximum chunk size, start over + // (this should rarely, if ever, happen) + if (chunk_size_ > max_size_) + { + decltype(pdata_){}.swap(pdata_); + chunk_size_ = pdata_.bytes_used(); + } +} + +template +postings_inverter::producer::~producer() +{ + flush_chunk(); +} + +template +postings_inverter::postings_inverter(const std::string& prefix) + : prefix_{prefix} +{ + // nothing +} + +template +auto postings_inverter::make_producer(uint64_t ram_budget) -> producer +{ + return {this, ram_budget}; +} + +template +void postings_inverter::write_chunk( + std::vector& pdata) +{ + auto chunk_num = chunk_num_.fetch_add(1); + + util::optional top; + { + std::lock_guard lock{mutables_}; + if (!chunks_.empty()) + { + top = chunks_.top(); + chunks_.pop(); + } + } + + if (!top) // pqueue was empty + { + std::string chunk_name = prefix_ + "/chunk-" + + std::to_string(chunk_num); + { + std::ofstream outfile{chunk_name, std::ios::binary}; + for (auto& p : pdata) + p.write_packed(outfile); + } + pdata.clear(); + + std::lock_guard lock{mutables_}; + chunks_.emplace(chunk_name); + } + else // we can merge with an existing chunk + { + top->memory_merge_with(pdata); + + std::lock_guard lock{mutables_}; + chunks_.emplace(*top); + } +} + +template +void postings_inverter::merge_chunks() +{ + std::vector to_merge; + to_merge.reserve(chunks_.size()); + while (!chunks_.empty()) + { + to_merge.emplace_back(chunks_.top().path()); + chunks_.pop(); + } + + std::ofstream outfile{prefix_ + "/postings.index", std::ios::binary}; + unique_primary_keys_ = multiway_merge( + outfile, to_merge.begin(), to_merge.end()); +} + +template +uint64_t postings_inverter::unique_primary_keys() const +{ + if (!unique_primary_keys_) + throw postings_inverter_exception{ + "merge has not been called before requesting unique primary keys"}; + return *unique_primary_keys_; +} + +template +uint64_t postings_inverter::final_size() const +{ + if (!chunks_.empty()) + throw postings_inverter_exception{ + "merge not complete before final_size() called"}; + return filesystem::file_size(prefix_ + "/postings.index"); +} + +template +uint32_t postings_inverter::size() const +{ + return chunk_num_.load(); +} +} +} diff --git a/include/index/postings_stream.h b/include/index/postings_stream.h index 6a7189a49..34a966484 100644 --- a/include/index/postings_stream.h +++ b/include/index/postings_stream.h @@ -69,7 +69,8 @@ class postings_stream * postings, since the size and total counts are provided on * construction. */ - postings_stream(const char* buffer, uint64_t size, uint64_t total_counts) + postings_stream(const char* buffer, uint64_t size, + FeatureValue total_counts) : start_{buffer}, size_{size}, total_counts_{total_counts} { // nothing @@ -87,7 +88,7 @@ class postings_stream * @return the total sum of the counts for SecondaryKeys in this * postings list. 
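     * (With this change the sum is a FeatureValue, so it may be fractional,
     * e.g. a double for forward-index postings, rather than always being a
     * uint64_t.)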
*/ - uint64_t total_counts() const + FeatureValue total_counts() const { return total_counts_; } @@ -223,7 +224,7 @@ class postings_stream private: const char* start_; uint64_t size_; - uint64_t total_counts_; + FeatureValue total_counts_; }; } } diff --git a/include/util/probe_set.h b/include/util/probe_set.h index b690cc09c..617a062cb 100644 --- a/include/util/probe_set.h +++ b/include/util/probe_set.h @@ -147,6 +147,14 @@ class probe_set return !(*this == rhs); } + /** + * @return the index of this key in the keys array + */ + std::size_t index() const + { + return parent_->table_[idx_]; + } + private: /** * The private constructor used by the probe_set to create its @@ -190,8 +198,9 @@ class probe_set /** * @param key an rvalue reference to the key to be inserted into the * table + * @return an iterator to the item inserted */ - void emplace(Key&& key) + iterator emplace(Key&& key) { if (alpha_.denominator * (keys_.size() + 1) >= alpha_.numerator * occupancy_.size()) @@ -208,6 +217,18 @@ class probe_set if (keys_.size() == keys_.capacity()) keys_.reserve(keys_.size() + (keys_.size() + 1) / 2); keys_.emplace_back(std::move(key)); + + return {this, idx}; + } + + /** + * @param key a reference to the key to be inserted into the table + * @return an iterator to the item inserted + */ + iterator insert(const Key& key) + { + Key to_insert{key}; + return emplace(std::move(to_insert)); } /** diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index 812ce4f41..c27b86ca0 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -4,22 +4,27 @@ */ #include "cpptoml.h" -#include "index/chunk_handler.h" +#include "analyzers/analyzer.h" +#include "corpus/corpus.h" +#include "index/chunk_reader.h" #include "index/disk_index_impl.h" #include "index/forward_index.h" #include "index/inverted_index.h" #include "index/metadata_writer.h" #include "index/postings_file.h" #include "index/postings_file_writer.h" +#include "index/postings_inverter.h" #include "index/string_list.h" #include "index/string_list_writer.h" #include "index/vocabulary_map.h" +#include "index/vocabulary_map_writer.h" #include "io/libsvm_parser.h" #include "parallel/thread_pool.h" #include "util/disk_vector.h" #include "util/mapping.h" #include "util/pimpl.tcc" #include "util/shim.h" +#include "util/time.h" namespace meta { @@ -37,6 +42,26 @@ class forward_index::impl */ impl(forward_index* idx); + /** + * Tokenizes the documents in the corpus in parallel, yielding + * num_threads number of forward_index chunks that then need to be + * merged. + */ + void tokenize_docs(corpus::corpus* corpus, + const analyzers::analyzer& analyzer, + metadata_writer& mdata_writer, uint64_t ram_budget); + + /** + * Merges together num_chunks number of intermediate chunks, using the + * given vocabulary to do the renumbering. + * + * The vocabulary mapping will assign ids in insertion order, but we + * will want our ids in lexicographic order for vocabulary_map to + * work, so this function will sort the vocabulary and perform a + * re-numbering of the old ids. 
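+     *
+     * For example (hypothetical terms): if insertion order assigned
+     * {"zebra" -> 0, "apple" -> 1}, the sorted vocabulary assigns
+     * {"apple" -> 0, "zebra" -> 1}, so a chunk posting with old id 0
+     * ("zebra") is rewritten with new id 1 before being written out.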
+ */ + void merge_chunks(size_t num_chunks, util::probe_set vocab); + /** * @param config the configuration settings for this index */ @@ -79,7 +104,8 @@ class forward_index::impl /// the postings file util::optional> postings_; + forward_index::secondary_key_type, double>> + postings_; private: /// Pointer to the forward_index this is an implementation of @@ -177,22 +203,48 @@ void forward_index::create_index(const std::string& config_file) } else { - LOG(info) << "Creating index by uninverting: " << index_name() << ENDLG; - { - // Ensure all files are flushed before uninverting - make_index(config_file); - } - auto inv_idx = make_index(config_file); - uint64_t ram_budget = 1024; if (auto cfg_ram_budget = config.get_as("indexer-ram-budget")) ram_budget = static_cast(*cfg_ram_budget); - fwd_impl_->create_uninverted_metadata(inv_idx->index_name()); - // RAM budget is given in MB - fwd_impl_->uninvert(*inv_idx, ram_budget * 1024 * 1024); - impl_->load_term_id_mapping(); - fwd_impl_->total_unique_terms_ = impl_->total_unique_terms(); + auto uninvert = config.get_as("uninvert"); + if (uninvert && *uninvert) + { + LOG(info) << "Creating index by uninverting: " << index_name() + << ENDLG; + { + // Ensure all files are flushed before uninverting + make_index(config_file); + } + auto inv_idx = make_index(config_file); + + fwd_impl_->create_uninverted_metadata(inv_idx->index_name()); + // RAM budget is given in MB + fwd_impl_->uninvert(*inv_idx, ram_budget * 1024 * 1024); + impl_->load_term_id_mapping(); + fwd_impl_->total_unique_terms_ = impl_->total_unique_terms(); + } + else + { + LOG(info) << "Creating forward index: " << index_name() << ENDLG; + + auto docs = corpus::corpus::load(config_file); + + { + auto analyzer = analyzers::analyzer::load(config); + + metadata_writer mdata_writer{index_name(), docs->size(), + docs->schema()}; + + impl_->load_labels(docs->size()); + + // RAM budget is given in MB + fwd_impl_->tokenize_docs(docs.get(), *analyzer, mdata_writer, + ram_budget * 1024 * 1024); + impl_->load_term_id_mapping(); + fwd_impl_->total_unique_terms_ = impl_->total_unique_terms(); + } + } } impl_->load_label_id_mapping(); @@ -210,6 +262,180 @@ void forward_index::create_index(const std::string& config_file) LOG(info) << "Done creating index: " << index_name() << ENDLG; } +void forward_index::impl::tokenize_docs(corpus::corpus* docs, + const analyzers::analyzer& ana, + metadata_writer& mdata_writer, + uint64_t ram_budget) +{ + std::mutex io_mutex; + std::mutex corpus_mutex; + std::mutex vocab_mutex; + printing::progress progress{" > Tokenizing Docs: ", docs->size()}; + + util::probe_set vocab; + bool exceeded_budget = false; + auto task = [&](size_t chunk_id) + { + std::ofstream chunk{idx_->index_name() + "/chunk-" + + std::to_string(chunk_id), + std::ios::binary}; + auto analyzer = ana.clone(); + while (true) + { + util::optional doc; + { + std::lock_guard lock{corpus_mutex}; + + if (!docs->has_next()) + return; + + doc = docs->next(); + progress(doc->id()); + } + + analyzer->tokenize(*doc); + + // warn if there is an empty document + if (doc->counts().empty()) + { + std::lock_guard lock{io_mutex}; + LOG(progress) << '\n' << ENDLG; + LOG(warning) << "Empty document (id = " << doc->id() + << ") generated!" 
<< ENDLG; + } + + mdata_writer.write(doc->id(), doc->length(), doc->counts().size(), + doc->mdata()); + idx_->impl_->set_label(doc->id(), doc->label()); + + forward_index::postings_data_type::count_t counts; + counts.reserve(doc->counts().size()); + { + std::lock_guard lock{vocab_mutex}; + for (const auto& count : doc->counts()) + { + auto it = vocab.find(count.first); + if (it == vocab.end()) + it = vocab.insert(count.first); + + counts.emplace_back(it.index(), count.second); + } + + if (!exceeded_budget && vocab.bytes_used() > ram_budget) + { + exceeded_budget = true; + std::lock_guard io_lock{io_mutex}; + LOG(progress) << '\n' << ENDLG; + LOG(warning) + << "Exceeding RAM budget; indexing cannot " + "proceed without exceeding specified RAM budget" + << ENDLG; + } + } + + forward_index::postings_data_type pdata{doc->id()}; + pdata.set_counts(std::move(counts)); + pdata.write_packed(chunk); + } + }; + + parallel::thread_pool pool; + auto num_threads = pool.thread_ids().size(); + std::vector> futures; + futures.reserve(num_threads); + for (size_t i = 0; i < num_threads; ++i) + futures.emplace_back(pool.submit_task(std::bind(task, i))); + + for (auto& fut : futures) + fut.get(); + + progress.end(); + + merge_chunks(num_threads, std::move(vocab)); +} + +void forward_index::impl::merge_chunks(size_t num_chunks, + util::probe_set vocab) +{ + auto keys = vocab.extract_keys(); + // vocab is now empty, but has enough space for the vocabulary + + { + // we now create a new vocab with the keys in sorted order + vocabulary_map_writer writer{idx_->index_name() + "/" + + idx_->impl_->files[TERM_IDS_MAPPING]}; + auto sorted_keys = keys; + std::sort(sorted_keys.begin(), sorted_keys.end()); + for (const auto& key : sorted_keys) + { + // in memory vocab + vocab.insert(key); + + // on disk vocab + writer.insert(key); + } + } + + // term_id in a chunk file corresponds to the index into the keys + // vector, which we can then use the new vocab to map to an index + postings_file_writer writer{ + idx_->index_name() + "/" + idx_->impl_->files[POSTINGS], vocab.size()}; + + using input_chunk = chunk_reader; + std::vector chunks; + chunks.reserve(num_chunks); + for (size_t i = 0; i < num_chunks; ++i) + chunks.emplace_back(idx_->index_name() + "/chunk-" + std::to_string(i)); + + printing::progress progress{ + " > Merging postings: ", + std::accumulate(chunks.begin(), chunks.end(), 0ul, + [](uint64_t acc, const input_chunk& chunk) + { + return acc + chunk.total_bytes(); + })}; + + uint64_t total_read + = std::accumulate(chunks.begin(), chunks.end(), 0ul, + [](uint64_t acc, const input_chunk& chunk) + { + return acc + chunk.bytes_read(); + }); + + while (!chunks.empty()) + { + progress(total_read); + + // find the lowest doc id + auto min_chunk = std::min_element(chunks.begin(), chunks.end()); + + // steal the postings and advance the chunk + auto to_write = min_chunk->postings(); + auto before = min_chunk->bytes_read(); + ++*min_chunk; + total_read += min_chunk->bytes_read() - before; + + // if there were no more postings, remove the chunk for the input + if (!*min_chunk) + chunks.erase(min_chunk); + + // renumber the postings + forward_index::postings_data_type::count_t counts; + counts.reserve(to_write.counts().size()); + for (const auto& count : to_write.counts()) + { + const auto& key = keys.at(count.first); + auto it = vocab.find(key); + assert(it != vocab.end()); + counts.emplace_back(it.index(), count.second); + } + + // set the new counts and write to the postings file + 
to_write.set_counts(std::move(counts)); + writer.write(to_write); + } +} + void forward_index::impl::create_libsvm_postings(const cpptoml::table& config) { auto prefix = config.get_as("prefix"); @@ -229,7 +455,8 @@ void forward_index::impl::create_libsvm_postings(const cpptoml::table& config) total_unique_terms_ = 0; { - postings_file_writer out{filename, num_docs}; + postings_file_writer out{filename, + num_docs}; // make md_writer with empty schema metadata_writer md_writer{idx_->index_name(), num_docs, {}}; @@ -259,11 +486,10 @@ void forward_index::impl::create_libsvm_postings(const cpptoml::table& config) length += count.second; } - pdata.set_counts(counts); - out.write(pdata); + pdata.set_counts(std::move(counts)); + out.write(pdata); - md_writer.write(d_id, static_cast(length), num_unique, - {}); + md_writer.write(d_id, length, num_unique, {}); ++d_id; } @@ -308,19 +534,19 @@ uint64_t forward_index::unique_terms() const auto forward_index::search_primary(doc_id d_id) const -> std::shared_ptr { - return fwd_impl_->postings_->find(d_id); + return fwd_impl_->postings_->find(d_id); } util::optional> forward_index::stream_for(doc_id d_id) const { - return fwd_impl_->postings_->find_stream(d_id); + return fwd_impl_->postings_->find_stream(d_id); } void forward_index::impl::uninvert(const inverted_index& inv_idx, uint64_t ram_budget) { - chunk_handler handler{idx_->index_name()}; + postings_inverter handler{idx_->index_name()}; { auto producer = handler.make_producer(ram_budget); for (term_id t_id{0}; t_id < inv_idx.unique_terms(); ++t_id) @@ -345,9 +571,10 @@ void forward_index::impl::compress(const std::string& filename, // can calculate the size of the compressed file and delete the // uncompressed version at the end { - postings_file_writer out{filename, num_docs}; + postings_file_writer out{filename, + num_docs}; - forward_index::postings_data_type pdata; + forward_index::index_pdata_type pdata; auto length = filesystem::file_size(ucfilename); std::ifstream in{ucfilename, std::ios::binary}; @@ -369,10 +596,19 @@ void forward_index::impl::compress(const std::string& filename, for (doc_id d_id{last_id + 1}; d_id < pdata.primary_key(); ++d_id) { forward_index::postings_data_type pd{d_id}; - out.write(pd); + out.write(pd); } - out.write(pdata); + // convert from int to double for feature values + forward_index::postings_data_type::count_t counts; + counts.reserve(pdata.counts().size()); + for (const auto& count : pdata.counts()) + counts.emplace_back(count.first, count.second); + + forward_index::postings_data_type to_write{pdata.primary_key()}; + to_write.set_counts(std::move(counts)); + out.write(to_write); + last_id = pdata.primary_key(); } } diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp index 39ef510f4..dc270c913 100644 --- a/src/index/inverted_index.cpp +++ b/src/index/inverted_index.cpp @@ -5,15 +5,13 @@ */ #include "corpus/corpus.h" -#include "index/chunk_handler.h" #include "index/disk_index_impl.h" #include "index/inverted_index.h" #include "corpus/metadata_parser.h" #include "index/metadata_writer.h" #include "index/postings_file.h" #include "index/postings_file_writer.h" -#include "index/string_list.h" -#include "index/string_list_writer.h" +#include "index/postings_inverter.h" #include "index/vocabulary_map.h" #include "index/vocabulary_map_writer.h" #include "parallel/thread_pool.h" @@ -47,14 +45,14 @@ class inverted_index::impl /** * @param docs The documents to be tokenized - * @param handler The chunk handler for this index + * @param inverter The 
postings inverter for this index * @param mdata_parser The parser for reading metadata * @param mdata_writer The writer for metadata * @param ram_budget The total **estimated** RAM budget * @return the number of chunks created */ void tokenize_docs(corpus::corpus* docs, - chunk_handler& handler, + postings_inverter& inverter, metadata_writer& mdata_writer, uint64_t ram_budget); /** @@ -128,7 +126,7 @@ void inverted_index::create_index(const std::string& config_file) if (cfg_ram_budget) ram_budget = static_cast(*cfg_ram_budget); - chunk_handler handler{index_name()}; + postings_inverter inverter{index_name()}; { metadata_writer mdata_writer{index_name(), docs->size(), docs->schema()}; @@ -136,17 +134,17 @@ void inverted_index::create_index(const std::string& config_file) impl_->load_labels(num_docs); // RAM budget is given in megabytes - inv_impl_->tokenize_docs(docs.get(), handler, mdata_writer, + inv_impl_->tokenize_docs(docs.get(), inverter, mdata_writer, ram_budget * 1024 * 1024); } - handler.merge_chunks(); + inverter.merge_chunks(); LOG(info) << "Created uncompressed postings file " << index_name() << impl_->files[POSTINGS] << " (" - << printing::bytes_to_units(handler.final_size()) << ")" << ENDLG; + << printing::bytes_to_units(inverter.final_size()) << ")" << ENDLG; - uint64_t num_unique_terms = handler.unique_primary_keys(); + uint64_t num_unique_terms = inverter.unique_primary_keys(); inv_impl_->compress(index_name() + impl_->files[POSTINGS], num_unique_terms); @@ -171,17 +169,16 @@ void inverted_index::load_index() inv_impl_->load_postings(); } -void inverted_index::impl::tokenize_docs(corpus::corpus* docs, - chunk_handler& handler, - metadata_writer& mdata_writer, - uint64_t ram_budget) +void inverted_index::impl::tokenize_docs( + corpus::corpus* docs, postings_inverter& inverter, + metadata_writer& mdata_writer, uint64_t ram_budget) { std::mutex mutex; printing::progress progress{" > Tokenizing Docs: ", docs->size()}; auto task = [&](uint64_t ram_budget) { - auto producer = handler.make_producer(ram_budget); + auto producer = inverter.make_producer(ram_budget); auto analyzer = analyzer_->clone(); while (true) { @@ -239,12 +236,13 @@ void inverted_index::impl::compress(const std::string& filename, // can calculate the size of the compressed file and delete the // uncompressed version at the end { - postings_file_writer out{filename, num_unique_terms}; + postings_file_writer out{ + filename, num_unique_terms}; vocabulary_map_writer vocab{idx_->index_name() + idx_->impl_->files[TERM_IDS_MAPPING]}; - postings_data pdata; + inverted_index::index_pdata_type pdata; auto length = filesystem::file_size(ucfilename); std::ifstream in{ucfilename, std::ios::binary}; uint64_t byte_pos = 0; diff --git a/src/test/forward_index_test.cpp b/src/test/forward_index_test.cpp index b56187ecd..146c28d03 100644 --- a/src/test/forward_index_test.cpp +++ b/src/test/forward_index_test.cpp @@ -128,9 +128,25 @@ int forward_index_tests() num_failed += testing::run_test("forward-index-read-file-corpus", [&]() { ceeaus_forward_test(); - system("rm -rf ceeaus-* test-config.toml"); }); + num_failed += testing::run_test("forward-index-build-uninvert", [&]() + { + system("rm -rf ceeaus-*"); + + // hack to inject "uninvert = true" at the top of the config file + auto cfg_contents = filesystem::file_text("test-config.toml"); + cfg_contents = "uninvert = true\n" + cfg_contents; + filesystem::delete_file("test-config.toml"); + { + std::ofstream file{"test-config.toml"}; + file.write(cfg_contents.c_str(), 
cfg_contents.size()); + } + + ceeaus_forward_test(); + }); + + filesystem::delete_file("test-config.toml"); create_config("line"); num_failed += testing::run_test("forward-index-build-line-corpus", [&]() @@ -145,6 +161,7 @@ int forward_index_tests() system("rm -rf ceeaus-* test-config.toml"); }); + create_libsvm_config(); num_failed += testing::run_test("forward-index-build-libsvm", [&]() From 582b4451a822773a7903130078c4445b53de76aa Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 4 Sep 2015 00:47:11 -0500 Subject: [PATCH 216/481] Lock the right mutex for progress reporting in forward_index. --- src/index/forward_index.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index c27b86ca0..e01acafc3 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -290,6 +290,9 @@ void forward_index::impl::tokenize_docs(corpus::corpus* docs, return; doc = docs->next(); + } + { + std::lock_guard lock{io_mutex}; progress(doc->id()); } From d076b3a1e7450a56103bbbede5700a53be8e017f Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 4 Sep 2015 02:08:02 -0500 Subject: [PATCH 217/481] Switch back to meta-toolkit fork of porter2_stemmer. I forgot I don't have push permissions on smassung/porter2_stemmer... --- .gitmodules | 2 +- deps/porter2_stemmer | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index bf53e8358..b90f4c37e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,7 +4,7 @@ branch = master [submodule "deps/porter2_stemmer"] path = deps/porter2_stemmer - url = https://github.com/smassung/porter2_stemmer.git + url = https://github.com/meta-toolkit/porter2_stemmer.git branch = master [submodule "deps/libsvm-modules"] path = deps/libsvm-modules diff --git a/deps/porter2_stemmer b/deps/porter2_stemmer index ca29419b2..a9718c892 160000 --- a/deps/porter2_stemmer +++ b/deps/porter2_stemmer @@ -1 +1 @@ -Subproject commit ca29419b2810f39391a8260bd3f1ff862ace9764 +Subproject commit a9718c892a935baff774dfb450f21e864a14c311 From a0db9e4c0c81b1027cb74dc656cb11d437d73148 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 4 Sep 2015 02:08:41 -0500 Subject: [PATCH 218/481] Fix debug compilation. --- src/index/forward_index.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index e01acafc3..9d5ea157d 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -321,7 +321,7 @@ void forward_index::impl::tokenize_docs(corpus::corpus* docs, if (it == vocab.end()) it = vocab.insert(count.first); - counts.emplace_back(it.index(), count.second); + counts.emplace_back(term_id{it.index()}, count.second); } if (!exceeded_budget && vocab.bytes_used() > ram_budget) @@ -430,7 +430,7 @@ void forward_index::impl::merge_chunks(size_t num_chunks, const auto& key = keys.at(count.first); auto it = vocab.find(key); assert(it != vocab.end()); - counts.emplace_back(it.index(), count.second); + counts.emplace_back(term_id{it.index()}, count.second); } // set the new counts and write to the postings file From b1851281900fadc070c3efc15aea5717b231a2c9 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 4 Sep 2015 02:08:58 -0500 Subject: [PATCH 219/481] Work around a bug in libstdc++ in gcc < 4.8.3. A static assertion would trigger if you have a hash function that is nonempty (like e.g. util::string_view). 
We now detect whether the standard library supports a non-empty hasher and, if it does not, we just use a fixed seed for the hash instead. --- CMakeLists.txt | 15 +++++++++++++++ include/util/string_view.h | 5 +++++ 2 files changed, 20 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e983f064a..564afa551 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -222,6 +222,21 @@ if(META_HAS_STD_MAKE_UNIQUE) -DMETA_HAS_STD_MAKE_UNIQUE) endif() +# work around a bug in libstdc++ provided with gcc < 4.8.3 where a static +# assertion fires when you have a non-empty hash functor +check_cxx_source_compiles(" +#include +struct nonempty_hasher : public std::hash { int i = 3; }; +int main() { + std::unordered_set s; + return 0; +}" META_HAS_NONEMPTY_HASH_SUPPORT) + +if (META_HAS_NONEMPTY_HASH_SUPPORT) + target_compile_definitions(meta-definitions INTERFACE + -DMETA_HAS_NONEMPTY_HASH_SUPPORT) +endif() + if(ICU_VERSION VERSION_LESS "4.4") target_compile_definitions(meta-definitions INTERFACE -DMETA_ICU_NO_TEMP_SUBSTRING) diff --git a/include/util/string_view.h b/include/util/string_view.h index 6cb0629d4..92084d433 100644 --- a/include/util/string_view.h +++ b/include/util/string_view.h @@ -614,11 +614,16 @@ namespace std template struct hash> { +#if META_HAS_NONEMPTY_HASH_SUPPORT meta::util::murmur_hash<> hasher; +#endif size_t operator()( const meta::util::basic_string_view& view) const noexcept { +#ifndef META_HAS_NONEMPTY_HASH_SUPPORT + meta::util::murmur_hash<> hasher{97562527}; +#endif return hasher(reinterpret_cast(view.data()), view.size()); } From aefefa0749cc1fcf4c00e066ad8505ed3630da0a Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 4 Sep 2015 16:05:28 -0500 Subject: [PATCH 220/481] Switch back to upstream for porter2_stemmer. --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index b90f4c37e..bf53e8358 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,7 +4,7 @@ branch = master [submodule "deps/porter2_stemmer"] path = deps/porter2_stemmer - url = https://github.com/meta-toolkit/porter2_stemmer.git + url = https://github.com/smassung/porter2_stemmer.git branch = master [submodule "deps/libsvm-modules"] path = deps/libsvm-modules From 17f0ab44a21d54d191b29cc512a1079345464d3c Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sun, 6 Sep 2015 09:30:19 -0500 Subject: [PATCH 221/481] coding and style tweaks re: #107 --- include/lm/sentence.h | 10 ++++- include/lm/static_probe_map.h | 15 ++++--- src/analyzers/analyzer.cpp | 7 +++- src/features/feature_selector.cpp | 53 ++++++++++++------------ src/lm/diff.cpp | 10 ++--- src/lm/language_model.cpp | 2 +- src/lm/sentence.cpp | 16 ++++++- src/lm/tools/diff_test.cpp | 3 +- src/sequence/crf/tools/pos_tokenizer.cpp | 5 +-- src/topics/tools/topic_corpus.cpp | 10 ++--- 10 files changed, 73 insertions(+), 58 deletions(-) diff --git a/include/lm/sentence.h b/include/lm/sentence.h index c32ea4849..25b73fdf6 100644 --- a/include/lm/sentence.h +++ b/include/lm/sentence.h @@ -86,9 +86,9 @@ class sentence */ const std::vector& operations() const; - std::string front() const; + const std::string& front() const; - std::string back() const; + const std::string& back() const; void push_front(const std::string& token); @@ -98,6 +98,12 @@ class sentence void pop_back(); + template + void emplace_front(Args&&... args); + + template + void emplace_back(Args&&... 
args); + /** * @return an iterator to the beginning of the sequence */ diff --git a/include/lm/static_probe_map.h b/include/lm/static_probe_map.h index 2110f2380..24a1c7045 100644 --- a/include/lm/static_probe_map.h +++ b/include/lm/static_probe_map.h @@ -76,16 +76,15 @@ class static_probe_map * Helper function to hash a string with util::murmur_hash */ uint64_t hash(const std::string& str) const; +}; +/** + * Basic exception for static_probe_map interactions. + */ +class static_probe_map_exception : public std::runtime_error +{ public: - /** - * Basic exception for static_probe_map interactions. - */ - class static_probe_map_exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; + using std::runtime_error::runtime_error; }; } } diff --git a/src/analyzers/analyzer.cpp b/src/analyzers/analyzer.cpp index cae30abb9..21c42fdf4 100644 --- a/src/analyzers/analyzer.cpp +++ b/src/analyzers/analyzer.cpp @@ -6,7 +6,12 @@ #include "analyzers/filter_factory.h" #include "analyzers/multi_analyzer.h" #include "analyzers/token_stream.h" -#include "analyzers/filters/all.h" +#include "analyzers/filters/alpha_filter.h" +#include "analyzers/filters/empty_sentence_filter.h" +#include "analyzers/filters/length_filter.h" +#include "analyzers/filters/list_filter.h" +#include "analyzers/filters/lowercase_filter.h" +#include "analyzers/filters/porter2_stemmer.h" #include "analyzers/tokenizers/icu_tokenizer.h" #include "corpus/document.h" #include "cpptoml.h" diff --git a/src/features/feature_selector.cpp b/src/features/feature_selector.cpp index 0b46e2924..8430ee783 100644 --- a/src/features/feature_selector.cpp +++ b/src/features/feature_selector.cpp @@ -11,7 +11,7 @@ #include "parallel/parallel_for.h" #include "features/feature_selector.h" #include "index/postings_data.h" -#include "io/binary.h" +#include "io/packed.h" namespace meta { @@ -22,7 +22,8 @@ feature_selector::feature_selector(const std::string& prefix, : prefix_{prefix}, idx_{std::move(idx)}, selected_{prefix_ + ".selected", idx_->unique_terms()} -{ /* nothing */ +{ + // nothing } void feature_selector::init(uint64_t features_per_class) @@ -62,7 +63,7 @@ void feature_selector::score_all() parallel::parallel_for( scores.begin(), scores.end(), [&](std::vector& v) { - std::sort(v.begin(), v.end(), [&](const pair_t& a, const pair_t& b) + std::sort(v.begin(), v.end(), [](const pair_t& a, const pair_t& b) { return a.second > b.second; }); @@ -71,38 +72,35 @@ void feature_selector::score_all() for (uint64_t lbl = 0; lbl < idx_->num_labels(); ++lbl) { // write (term_id, score) pairs - std::ofstream out{prefix_ + "." + std::to_string(lbl + 1)}; + std::ofstream out{prefix_ + "." + std::to_string(lbl + 1), + std::ios::binary}; for (auto& score : scores[lbl]) { - io::write_binary(out, score.first); - io::write_binary(out, score.second); + io::packed::write(out, score.first); + io::packed::write(out, score.second); } } } void feature_selector::select(uint64_t features_per_class /* = 20 */) { + // zero out old vector + for (auto& b : selected_) + b = false; + term_id id; double score; - std::unordered_set terms; for (uint64_t lbl = 0; lbl < idx_->num_labels(); ++lbl) { - std::ifstream in{prefix_ + "." + std::to_string(lbl + 1)}; + std::ifstream in{prefix_ + "." 
+ std::to_string(lbl + 1), + std::ios::binary}; for (uint64_t i = 0; i < features_per_class; ++i) { - io::read_binary(in, id); - io::read_binary(in, score); - terms.insert(id); + io::packed::read(in, id); + io::packed::read(in, score); + selected_[id] = true; } } - - // zero out old vector - for (auto& b : selected_) - b = false; - - // select new features - for (auto& term : terms) - selected_[term] = true; } bool feature_selector::selected(term_id term) const @@ -112,6 +110,10 @@ bool feature_selector::selected(term_id term) const void feature_selector::select_percent(double p /* = 0.05 */) { + if (p <= 0.0 || p >= 1.0) + throw feature_selector_exception{ + "select_percent needs a value p, 0 < p < 1"}; + double num_features = p * idx_->unique_terms(); uint64_t per_class = num_features / idx_->num_labels(); // truncate to int select(per_class); @@ -121,7 +123,7 @@ void feature_selector::calc_probs() { printing::progress prog{" > Calculating feature probs: ", idx_->num_docs()}; uint64_t total_terms = 0; - for (doc_id did = doc_id{0}; did < idx_->num_docs(); ++did) + for (doc_id did{0}; did < idx_->num_docs(); ++did) { prog(did); auto lid = idx_->lbl_id(did); @@ -150,20 +152,19 @@ void feature_selector::print_summary(uint64_t k /* = 20 */) const { term_id tid; double score; - for (uint64_t lbl = 0; lbl < idx_->num_labels(); ++lbl) + for (auto lbl = 1_lid; lbl <= idx_->num_labels(); ++lbl) { std::cout << std::endl << "Top " << k << " features for \"" - << idx_->class_label_from_id(static_cast(lbl + 1)) - << "\":" << std::endl + << idx_->class_label_from_id(lbl) << "\":" << std::endl << "===============================" << std::endl; // read (term_id, score) pairs - std::ifstream in{prefix_ + "." + std::to_string(lbl + 1)}; + std::ifstream in{prefix_ + "." + std::to_string(lbl), std::ios::binary}; for (uint64_t i = 0; i < k; ++i) { - io::read_binary(in, tid); - io::read_binary(in, score); + io::packed::read(in, tid); + io::packed::read(in, score); std::cout << (i + 1) << ". 
" << idx_->term_text(tid) << " (" << score << ")" << std::endl; } diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index 98061f7db..ad31530e5 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -7,6 +7,7 @@ #include #include "lm/diff.h" #include "porter2_stemmer.h" +#include "utf/utf.h" namespace meta { @@ -154,7 +155,7 @@ template void diff::insert(const sentence& sent, size_t idx, PQ& candidates, uint64_t depth) { - for (auto& fw : fwords_) + for (const auto& fw : fwords_) { sentence ins_cpy{sent}; ins_cpy.insert(idx, fw, base_penalty_ + insert_penalty_); @@ -239,12 +240,9 @@ void diff::set_stems(const cpptoml::table& config) std::ifstream in{prefix + "/" + dataset + "/" + dataset + ".dat"}; std::string token; while (in >> token) - { - std::transform(token.begin(), token.end(), token.begin(), ::tolower); - vocab.insert(token); - } + vocab.insert(utf::foldcase(token)); - for (auto& t : vocab) + for (const auto& t : vocab) { std::string stemmed{t}; Porter2Stemmer::stem(stemmed); diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index 7183475cf..7e8b8540d 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -124,7 +124,7 @@ std::vector> sentence candidate = prev; candidate.push_back("word"); // the last item is replaced each iteration - for (auto& word : vocabulary_) + for (const auto& word : vocabulary_) { auto candidate = sentence{prev.to_string() + " " + word}; candidates.emplace_back(word, log_prob(candidate)); diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp index 9496f77a0..47b21968a 100644 --- a/src/lm/sentence.cpp +++ b/src/lm/sentence.cpp @@ -119,12 +119,12 @@ const std::vector& sentence::operations() const return ops_; } -std::string sentence::front() const +const std::string& sentence::front() const { return tokens_.front(); } -std::string sentence::back() const +const std::string& sentence::back() const { return tokens_.back(); } @@ -149,6 +149,18 @@ void sentence::pop_back() tokens_.pop_back(); } +template +void sentence::emplace_front(Args&&... args) +{ + tokens_.emplace_front(std::forward(args)...); +} + +template +void sentence::emplace_back(Args&&... 
args) +{ + tokens_.emplace_back(std::forward(args)...); +} + sentence::iterator sentence::begin() { return tokens_.begin(); diff --git a/src/lm/tools/diff_test.cpp b/src/lm/tools/diff_test.cpp index 550955ef8..63f7a6252 100644 --- a/src/lm/tools/diff_test.cpp +++ b/src/lm/tools/diff_test.cpp @@ -34,9 +34,8 @@ int main(int argc, char* argv[]) std::string line; size_t done = 0; double do_nothing = 0; - while (in) + while (std::getline(in, line)) { - std::getline(in, line); if (line.empty()) continue; diff --git a/src/sequence/crf/tools/pos_tokenizer.cpp b/src/sequence/crf/tools/pos_tokenizer.cpp index d25bba3a5..232e3ae62 100644 --- a/src/sequence/crf/tools/pos_tokenizer.cpp +++ b/src/sequence/crf/tools/pos_tokenizer.cpp @@ -87,10 +87,7 @@ int main(int argc, char* argv[]) if (ptb_special.find(word) != ptb_special.end()) std::cout << word << " "; else if (keep_list.find(word) != keep_list.end()) - { - word = utf::foldcase(word); - std::cout << word << " "; - } + std::cout << utf::foldcase(word) << " "; else std::cout << analyzer.tag(obs.label()) << " "; } diff --git a/src/topics/tools/topic_corpus.cpp b/src/topics/tools/topic_corpus.cpp index 319cb948f..b485e209e 100644 --- a/src/topics/tools/topic_corpus.cpp +++ b/src/topics/tools/topic_corpus.cpp @@ -30,9 +30,8 @@ std::vector get_topic_ids(std::ifstream& thetas) { std::vector topic_ids; std::string line; - while (thetas) + while (std::getline(thetas, line)) { - std::getline(thetas, line); if (line.empty()) continue; std::istringstream stream{line}; @@ -42,9 +41,8 @@ std::vector get_topic_ids(std::ifstream& thetas) std::string to_split; size_t best_topic = 0; double best_prob = 0; - while (stream) + while (stream >> to_split) { - stream >> to_split; if (to_split.length() == 0) continue; size_t idx = to_split.find_first_of(':'); @@ -106,9 +104,9 @@ void create_topic_corpus(const std::string& prefix, const std::string& dataset, { out_dist << label.first; double total = 0.0; - for (auto& count : label.second) + for (const auto& count : label.second) total += count; - for (auto& count : label.second) + for (const auto& count : label.second) out_dist << "\t" << count / total; out_dist << std::endl; } From 5c3fc95dca534e9a8581ee050bf07e4b1ed23482 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sun, 6 Sep 2015 10:28:53 -0500 Subject: [PATCH 222/481] move diff_analyzer into lm/ dir re: #104 --- include/analyzers/all.h | 1 - include/{ => lm}/analyzers/diff_analyzer.h | 0 src/analyzers/CMakeLists.txt | 2 -- src/analyzers/analyzer_factory.cpp | 1 - src/lm/CMakeLists.txt | 1 + src/lm/analyzers/CMakeLists.txt | 4 ++++ src/{ => lm}/analyzers/diff_analyzer.cpp | 2 +- src/test/CMakeLists.txt | 2 +- 8 files changed, 7 insertions(+), 6 deletions(-) rename include/{ => lm}/analyzers/diff_analyzer.h (100%) create mode 100644 src/lm/analyzers/CMakeLists.txt rename src/{ => lm}/analyzers/diff_analyzer.cpp (97%) diff --git a/include/analyzers/all.h b/include/analyzers/all.h index 4c9e0b8b4..99f09f541 100644 --- a/include/analyzers/all.h +++ b/include/analyzers/all.h @@ -1,6 +1,5 @@ #include "analyzers/analyzer.h" #include "analyzers/multi_analyzer.h" -#include "analyzers/diff_analyzer.h" #include "analyzers/ngram/ngram_analyzer.h" #include "analyzers/ngram/ngram_word_analyzer.h" diff --git a/include/analyzers/diff_analyzer.h b/include/lm/analyzers/diff_analyzer.h similarity index 100% rename from include/analyzers/diff_analyzer.h rename to include/lm/analyzers/diff_analyzer.h diff --git a/src/analyzers/CMakeLists.txt b/src/analyzers/CMakeLists.txt index 
757bd55b3..7cf7e05af 100644 --- a/src/analyzers/CMakeLists.txt +++ b/src/analyzers/CMakeLists.txt @@ -6,11 +6,9 @@ add_subdirectory(tools) add_library(meta-analyzers analyzer.cpp analyzer_factory.cpp - diff_analyzer.cpp multi_analyzer.cpp ngram/ngram_analyzer.cpp ngram/ngram_word_analyzer.cpp) target_link_libraries(meta-analyzers meta-corpus meta-filters - meta-language-model meta-tokenizers) diff --git a/src/analyzers/analyzer_factory.cpp b/src/analyzers/analyzer_factory.cpp index b1b22d3b4..3c9fd28b0 100644 --- a/src/analyzers/analyzer_factory.cpp +++ b/src/analyzers/analyzer_factory.cpp @@ -21,7 +21,6 @@ analyzer_factory::analyzer_factory() { // built-in analyzers register_analyzer(); - register_analyzer(); } } } diff --git a/src/lm/CMakeLists.txt b/src/lm/CMakeLists.txt index 48473b38e..464d2f879 100644 --- a/src/lm/CMakeLists.txt +++ b/src/lm/CMakeLists.txt @@ -1,6 +1,7 @@ project(meta-language-model) add_subdirectory(tools) +add_subdirectory(analyzers) add_library(meta-language-model language_model.cpp diff.cpp diff --git a/src/lm/analyzers/CMakeLists.txt b/src/lm/analyzers/CMakeLists.txt new file mode 100644 index 000000000..756fac1f2 --- /dev/null +++ b/src/lm/analyzers/CMakeLists.txt @@ -0,0 +1,4 @@ +project(meta-lm-analyzers) + +add_library(meta-lm-analyzers diff_analyzer.cpp) +target_link_libraries(meta-lm-analyzers meta-analyzers meta-language-model) diff --git a/src/analyzers/diff_analyzer.cpp b/src/lm/analyzers/diff_analyzer.cpp similarity index 97% rename from src/analyzers/diff_analyzer.cpp rename to src/lm/analyzers/diff_analyzer.cpp index c1505d99a..46ae65c42 100644 --- a/src/analyzers/diff_analyzer.cpp +++ b/src/lm/analyzers/diff_analyzer.cpp @@ -7,7 +7,7 @@ #include #include "corpus/document.h" -#include "analyzers/diff_analyzer.h" +#include "lm/analyzers/diff_analyzer.h" #include "analyzers/token_stream.h" namespace meta diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 1e81ae6c1..784cddcab 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -19,7 +19,7 @@ add_library(meta-testing analyzer_test.cpp vocabulary_map_test.cpp parser_test.cpp) target_link_libraries(meta-testing meta-index meta-classify meta-parser - meta-features) + meta-features meta-language-model) set(UNIT_TEST_EXE unit-test) include(unit_tests.cmake) From 0d08a3f678bf7a58a44fe8fa937bb897fd8a8da9 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sun, 6 Sep 2015 10:33:31 -0500 Subject: [PATCH 223/481] simplify feature selector factory registration --- include/features/selector_factory.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/include/features/selector_factory.h b/include/features/selector_factory.h index d1b6a2ee6..eaa6b7e02 100644 --- a/include/features/selector_factory.h +++ b/include/features/selector_factory.h @@ -93,12 +93,7 @@ std::unique_ptr template void register_selector() { - selector_factory::get().add(Selector::id, - [](const cpptoml::table& config, - std::shared_ptr idx) - { - return make_selector(config, std::move(idx)); - }); + selector_factory::get().add(Selector::id, make_selector); } } } From b1221ed3cb017fd9f3a8801e33bb48642544ff6f Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sun, 6 Sep 2015 10:58:06 -0500 Subject: [PATCH 224/481] update comments and configurable fields in lm::diff --- include/lm/analyzers/diff_analyzer.h | 3 +- include/lm/diff.h | 59 +++++++++++++++++++++++----- src/lm/diff.cpp | 9 +++++ 3 files changed, 61 insertions(+), 10 deletions(-) diff --git a/include/lm/analyzers/diff_analyzer.h 
b/include/lm/analyzers/diff_analyzer.h
index 0e7bc0571..613856109 100644
--- a/include/lm/analyzers/diff_analyzer.h
+++ b/include/lm/analyzers/diff_analyzer.h
@@ -21,7 +21,8 @@ namespace analyzers
 {
 
 /**
- * Analyzes documents using their tokenized words.
+ * Analyzes documents using lm::diff edits; see lm::diff for config file
+ * information and further explanation.
  */
 class diff_analyzer : public util::clonable
 {
diff --git a/include/lm/diff.h b/include/lm/diff.h
index f0205bab5..66f582084 100644
--- a/include/lm/diff.h
+++ b/include/lm/diff.h
@@ -19,6 +19,38 @@ namespace meta
 {
 namespace lm
 {
+/**
+ * Uses a language model to transform sentences given a reference text
+ * collection. These transformations can be used directly or can be employed as
+ * features to represent text data in a wide variety of text mining
+ * applications.
+ * @see Sean Massung and ChengXiang Zhai. 2015. "SyntacticDiff: Operator-Based
+ * Transformation for Comparative Text Mining"
+ * @see http://web.engr.illinois.edu/~massung1/files/bigdata-2015.pdf
+ * @note It is *very important* that the language model .arpa file and the input
+ * to lm::diff are tokenized in the same way!
+ *
+ * Required config parameters:
+ * ~~~toml
+ * [diff]
+ * n-value = 3 # e.g.
+ * max-edits = 2 # e.g., probably something in [1,5]
+ * function-words = "path-to-file.txt" # words that may be inserted into
+ *                                     # the sentence
+ * ~~~
+ *
+ * Optional config parameters:
+ * ~~~toml
+ * [diff]
+ * base-penalty = 0.1
+ * insert-penalty = 0.1
+ * substitute-penalty = 0.1
+ * remove-penalty = 0.1
+ * max-candidates = 20
+ * lambda = 0.5 # balances scoring between perplexity and edits, in [0,1]
+ * lm-generate = false # use LM to insert likely words (may be slow!)
+ * ~~~
+ */
 class diff
 {
   public:
@@ -34,7 +66,9 @@ class diff
 
     /**
      * @param sent The sentence object to inspect
-     * @return the index of the least-likely ngram according
+     * @return the index of the least-likely ngram according to perplexity
+     * Runtime is linear in the sentence length, since log_prob is called on
+     * each ngram in the sentence.
      */
     uint64_t least_likely_ngram(const sentence& sent) const;
 
@@ -58,6 +92,9 @@ class diff
      * @param sent The sentence to transform
      * @param use_lm
      * @return a sorted list of candidate corrections and their scores
+     * The runtime depends on the value of parameters set in the config file:
+     * - exponential in the maximum number of edits
+     * - linear in n and the sentence length
      */
     std::vector> candidates(const sentence& sent,
                                                        bool use_lm = true);
@@ -127,10 +164,8 @@ class diff
 
     language_model lm_;
 
-    /// The order of the language model
     uint64_t n_val_;
     uint64_t max_edits_;
-
     double base_penalty_;
     double insert_penalty_;
     double substitute_penalty_;
@@ -141,19 +176,25 @@ class diff
     /// index.
     bool use_lm_;
 
+    /// map of "stem" -> [words that stem to "stem"]
     std::unordered_map> stems_;
+
+    /// List of words that can be inserted into the sentence (default is
+    /// function words)
     std::vector fwords_;
+
+    /// Keeps track of sentences that have already been generated so we don't
+    /// perform redundant calculations
     std::unordered_set seen_;
 
-    /// How many candidate sentences to store.
-    static constexpr uint64_t max_cand_size_ = 20;
+    /// How many candidate sentences to store when calling diff::candidates
+    uint64_t max_cand_size_;
 
     /// Balances perplexity and edit weights.
-    static constexpr double lambda_ = 0.5;
+    double lambda_;
 
-    /// Whether to insert likely words based on the language model. 
This is - /// currently turned off due to the LM representation making it inefficient. - static constexpr bool lm_generate_ = false; + /// Whether to insert likely words based on the language model. + bool lm_generate_; }; class diff_exception : public std::runtime_error diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index ad31530e5..12d0363e3 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -41,6 +41,15 @@ diff::diff(const cpptoml::table& config) : lm_{config} auto r_pen = table->get_as("remove-penalty"); remove_penalty_ = r_pen ? *r_pen : 0.0; + auto max_cand = table->get_as("max-candidates"); + max_cand_size_ = max_cand ? *max_cand : 20; + + auto lambda = table->get_as("lambda"); + lambda_ = lambda ? *lambda : 0.5; + + auto lm_gen = table->get_as("lm-generate"); + lm_generate_ = lm_gen ? *lm_gen : false; + set_stems(*table); set_function_words(*table); } From e9ba5e53b8b401e8256969d03ac7c24c1d678fc5 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sun, 6 Sep 2015 10:58:20 -0500 Subject: [PATCH 225/481] update tree features with paper reference --- .../parser/analyzers/featurizers/semi_skeleton_featurizer.h | 3 +++ include/parser/analyzers/featurizers/skeleton_featurizer.h | 3 +++ 2 files changed, 6 insertions(+) diff --git a/include/parser/analyzers/featurizers/semi_skeleton_featurizer.h b/include/parser/analyzers/featurizers/semi_skeleton_featurizer.h index fe5f3d322..f14fa9e2b 100644 --- a/include/parser/analyzers/featurizers/semi_skeleton_featurizer.h +++ b/include/parser/analyzers/featurizers/semi_skeleton_featurizer.h @@ -21,6 +21,9 @@ namespace analyzers /** * Tokenizes parse trees by keeping track of only a single node label and * the underlying tree structure. + * @see Sean Massung, ChengXiang Zhai, and Julia Hockenmaier. 2013. "Structural + * Parse Tree Features for Text Representation" + * @see http://web.engr.illinois.edu/~massung1/files/icsc-2013.pdf */ class semi_skeleton_featurizer : public util::clonable diff --git a/include/parser/analyzers/featurizers/skeleton_featurizer.h b/include/parser/analyzers/featurizers/skeleton_featurizer.h index 4d0979fe0..3b99f9954 100644 --- a/include/parser/analyzers/featurizers/skeleton_featurizer.h +++ b/include/parser/analyzers/featurizers/skeleton_featurizer.h @@ -20,6 +20,9 @@ namespace analyzers /** * Tokenizes parse trees by only tokenizing the tree structure itself. + * @see Sean Massung, ChengXiang Zhai, and Julia Hockenmaier. 2013. "Structural + * Parse Tree Features for Text Representation" + * @see http://web.engr.illinois.edu/~massung1/files/icsc-2013.pdf */ class skeleton_featurizer : public util::clonable From f6469830034d30ba34feeceefa3ceb355f500198 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sun, 6 Sep 2015 11:13:08 -0500 Subject: [PATCH 226/481] update comments in lm::sentence re: #104 --- include/lm/sentence.h | 50 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/include/lm/sentence.h b/include/lm/sentence.h index 25b73fdf6..f35126585 100644 --- a/include/lm/sentence.h +++ b/include/lm/sentence.h @@ -15,6 +15,14 @@ namespace meta { namespace lm { +/** + * A sequence of tokens that represents a sentence. Tokens are stored in a list + * format to enable operations such as insert, substitute, and remove. If an + * edit is performed, it is remembered as part of an ordered sequence of + * operations. Further, different weights may be assigned to any arbitrary edit + * operation, and these weights may also be returned as an ordered sequence. 
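+ *
+ * A small sketch (hypothetical tokens; the string constructor is assumed to
+ * tokenize on whitespace):
+ *
+ * ~~~cpp
+ * sentence sent{"the dog barked"};
+ * sent.substitute(1, "cat", 0.1); // sentence is now "the cat barked"
+ * // sent.operations() records the edit and sent.weights() returns {0.1}
+ * ~~~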
+ * @see Useful in conjunction with lm::diff + */ class sentence { public: @@ -46,13 +54,18 @@ class sentence */ const std::string& operator[](size_type idx) const; + /** + * Slicing/substring operator + * @param from index of left side of sentence + * @param to index of right side of sentence + */ sentence operator()(size_type from, size_type to) const; /** + * Replace the token at the specified index with the provided token * @param idx * @param token * @param weight The weight that this edit carries - * @return replace the token at the specified index with the provided token */ void substitute(size_type idx, const std::string& token, double weight = 0.0); @@ -78,29 +91,57 @@ class sentence /** * @return the sequence of edit weights to this sentence + * @see useful in conjunction with lm::diff */ std::vector weights() const; /** - * @return the operations (edits) performed on this sentence + * @return the string representations of the operations (edits) performed on + * this sentence */ const std::vector& operations() const; + /** + * @return the token at the front of the sentence + */ const std::string& front() const; + /** + * @return the token at the end of the sentence + */ const std::string& back() const; + /** + * Inserts a token at the beginning of the sentence + * @param token The token to insert + */ void push_front(const std::string& token); + /** + * Remove the token at the beginning of the sentence + */ void pop_front(); + /** + * Inserts a token at the end of the sentence + * @param token The token to insert + */ void push_back(const std::string& token); + /** + * Remove the token at the end of the sentence + */ void pop_back(); + /** + * Emplaces a token at the beginning of the sentence + */ template void emplace_front(Args&&... args); + /** + * Emplaces a token at the end of the sentence + */ template void emplace_back(Args&&... 
args); @@ -130,8 +171,13 @@ class sentence size_type size() const; private: + /// The tokens (words) in the sentence std::deque tokens_; + + /// String representations of the sequence of edit oeprations performed std::vector ops_; + + /// Ordered sequence of edit weights std::vector weights_; }; From 317520114cf2a2539a0b45c521cf943d94129ce8 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sun, 6 Sep 2015 11:15:07 -0500 Subject: [PATCH 227/481] clearer feature name in diff_analyzer re: #107 --- src/lm/analyzers/diff_analyzer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lm/analyzers/diff_analyzer.cpp b/src/lm/analyzers/diff_analyzer.cpp index 46ae65c42..6c7de999d 100644 --- a/src/lm/analyzers/diff_analyzer.cpp +++ b/src/lm/analyzers/diff_analyzer.cpp @@ -67,7 +67,7 @@ void diff_analyzer::tokenize(corpus::document& doc) } catch (lm::sentence_exception& ex) { - doc.increment("error", 1); + doc.increment("no-candidates", 1); } } From 6d963cafe24f067f19efa1be47db0fe4ff06fa71 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sun, 6 Sep 2015 11:19:18 -0500 Subject: [PATCH 228/481] complexity comment re: #107 --- include/lm/language_model.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/lm/language_model.h b/include/lm/language_model.h index f2bf43455..2b954adc8 100644 --- a/include/lm/language_model.h +++ b/include/lm/language_model.h @@ -83,6 +83,8 @@ class language_model * @param prev Seen tokens to base the next token off of * @param k Number of results to return * @return a sorted vector of likely next tokens + * Complexity is currently O(|V|) due to the LM structure; this should be + * changed in a future version of MeTA. */ std::vector> top_k(const sentence& prev, size_t k) const; From e9389f3829c8797fbecdf79c2a7d9205e38cc65e Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sun, 6 Sep 2015 11:27:13 -0500 Subject: [PATCH 229/481] change isnan to isfinite to also check for inf vals in info gain --- src/features/information_gain.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/features/information_gain.cpp b/src/features/information_gain.cpp index b101bbfcc..cf56e5f98 100644 --- a/src/features/information_gain.cpp +++ b/src/features/information_gain.cpp @@ -29,13 +29,13 @@ double information_gain::score(label_id lid, term_id tid) const double gain_tnc = p_tnc * std::log(p_tnc / (p_t * p_nc)); // if any denominators were zero, make the expression zero - if (std::isnan(gain_tc)) + if (std::isfinite(gain_tc)) gain_tc = 0.0; - if (std::isnan(gain_ntnc)) + if (std::isfinite(gain_ntnc)) gain_ntnc = 0.0; - if (std::isnan(gain_ntc)) + if (std::isfinite(gain_ntc)) gain_ntc = 0.0; - if (std::isnan(gain_tnc)) + if (std::isfinite(gain_tnc)) gain_tnc = 0.0; return gain_tc + gain_ntnc + gain_ntc + gain_tnc; From 14b2edc217ffc20d0d720343aec4d9fe3c71327b Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sun, 6 Sep 2015 11:32:08 -0500 Subject: [PATCH 230/481] reserve space in diff's candidates return vector --- src/lm/diff.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index 12d0363e3..7017356ed 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -71,6 +71,7 @@ std::vector> step(sent, candidates, 0); std::vector sorted; + sorted.reserve(candidates.size()); while (!candidates.empty()) { sorted.emplace_back(std::move(candidates.top())); From baecee281e1d20506dded290216059d9bcab7738 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sun, 6 Sep 2015 11:33:25 -0500 Subject: [PATCH 231/481] improve 
comment in diff --- include/lm/diff.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/lm/diff.h b/include/lm/diff.h index 66f582084..b19c2c0f3 100644 --- a/include/lm/diff.h +++ b/include/lm/diff.h @@ -66,7 +66,8 @@ class diff /** * @param sent The sentence object to inspect - * @return the index of the least-likely ngram according to perplexity + * @return the index of the last word in the least-likely ngram according to + * perplexity * Runtime is linear in the sentence length, since log_prob is called on * each ngram in the sentence. */ From 5cae75d240fb8e30a06088656c588ede53137e75 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sun, 6 Sep 2015 14:23:50 -0500 Subject: [PATCH 232/481] fix bug with inverted logic in information_gain.cpp --- src/features/information_gain.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/features/information_gain.cpp b/src/features/information_gain.cpp index cf56e5f98..d1c2d6932 100644 --- a/src/features/information_gain.cpp +++ b/src/features/information_gain.cpp @@ -29,13 +29,13 @@ double information_gain::score(label_id lid, term_id tid) const double gain_tnc = p_tnc * std::log(p_tnc / (p_t * p_nc)); // if any denominators were zero, make the expression zero - if (std::isfinite(gain_tc)) + if (!std::isfinite(gain_tc)) gain_tc = 0.0; - if (std::isfinite(gain_ntnc)) + if (!std::isfinite(gain_ntnc)) gain_ntnc = 0.0; - if (std::isfinite(gain_ntc)) + if (!std::isfinite(gain_ntc)) gain_ntc = 0.0; - if (std::isfinite(gain_tnc)) + if (!std::isfinite(gain_tnc)) gain_tnc = 0.0; return gain_tc + gain_ntnc + gain_ntc + gain_tnc; From 4cbb75bf3b7c6b646e01608b7bab1680230c1883 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sun, 6 Sep 2015 16:20:43 -0500 Subject: [PATCH 233/481] Add optional::value_or() functions. --- include/util/optional.h | 16 ++++++++++++++++ include/util/optional.tcc | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/include/util/optional.h b/include/util/optional.h index a85e83064..16c92465b 100644 --- a/include/util/optional.h +++ b/include/util/optional.h @@ -194,6 +194,22 @@ class optional : public util::comparable> */ void clear(); + /** + * @param default_value The value to return if this optional is empty + * @return the contained value if there is on, or default_value + * otherwise + */ + template + T value_or(U&& default_value) const&; + + /** + * @param default_value The value to return if this optional is empty + * @return the contained value if there is on, or default_value + * otherwise + */ + template + T value_or(U&& default_value) &&; + private: /** * Helper function to obtain the address of the contained value. diff --git a/include/util/optional.tcc b/include/util/optional.tcc index 17018fbef..8a2d9cb69 100644 --- a/include/util/optional.tcc +++ b/include/util/optional.tcc @@ -147,6 +147,22 @@ void optional::clear() initialized_ = false; } +template +template +T optional::value_or(U&& default_value) const & +{ + return bool(*this) ? **this + : static_cast(std::forward(default_value)); +} + +template +template +T optional::value_or(U&& default_value) && +{ + return bool(*this) ? std::move(**this) + : static_cast(std::forward(default_value)); +} + template const T* optional::dataptr() const { From de139831eede07846087aceb66e8c72dd4ea5c53 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sun, 6 Sep 2015 16:36:28 -0500 Subject: [PATCH 234/481] Use std::experimental::optional/string_view when available. 
Otherwise, fall back on our own implementations, which we should eventually get rid of when these become standardized and readily available in compilers. --- CMakeLists.txt | 24 ++++++++++++++++++++++++ include/index/vocabulary_map.h | 10 +--------- include/util/optional.h | 20 +++++++++++++++++--- include/util/string_view.h | 20 +++++++++++++++++++- 4 files changed, 61 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 564afa551..baf89bd6a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -222,6 +222,30 @@ if(META_HAS_STD_MAKE_UNIQUE) -DMETA_HAS_STD_MAKE_UNIQUE) endif() +check_cxx_source_compiles(" +#include +int main() { + std::experimental::optional x; + return 0; +}" META_HAS_EXPERIMENTAL_OPTIONAL) + +if (META_HAS_EXPERIMENTAL_OPTIONAL) + target_compile_definitions(meta-definitions INTERFACE + -DMETA_HAS_EXPERIMENTAL_OPTIONAL) +endif() + +check_cxx_source_compiles(" +#include +int main() { + std::experimental::string_view sv = \"hello world\"; + return 0; +}" META_HAS_EXPERIMENTAL_STRING_VIEW) + +if (META_HAS_EXPERIMENTAL_STRING_VIEW) + target_compile_definitions(meta-definitions INTERFACE + -DMETA_HAS_EXPERIMENTAL_STRING_VIEW) +endif() + # work around a bug in libstdc++ provided with gcc < 4.8.3 where a static # assertion fires when you have a non-empty hash functor check_cxx_source_compiles(" diff --git a/include/index/vocabulary_map.h b/include/index/vocabulary_map.h index 1976f4f6a..c4bf443d6 100644 --- a/include/index/vocabulary_map.h +++ b/include/index/vocabulary_map.h @@ -12,15 +12,7 @@ #include "io/mmap_file.h" #include "util/disk_vector.h" - -namespace meta -{ -namespace util -{ -template -class optional; -} -} +#include "util/optional.h" namespace meta { diff --git a/include/util/optional.h b/include/util/optional.h index 16c92465b..f1b8a485d 100644 --- a/include/util/optional.h +++ b/include/util/optional.h @@ -7,9 +7,22 @@ * project. */ -#ifndef META_OPTIONAL_H_ -#define META_OPTIONAL_H_ +#ifndef META_UTIL_OPTIONAL_H_ +#define META_UTIL_OPTIONAL_H_ +#if META_HAS_EXPERIMENTAL_OPTIONAL +#include +namespace meta +{ +namespace util +{ +template +using optional = std::experimental::optional; + +using std::experimental::nullopt; +} +} +#else #include #include #include "util/comparable.h" @@ -254,4 +267,5 @@ class bad_optional_access : public std::runtime_error } #include "util/optional.tcc" -#endif +#endif // !META_HAS_EXPERIMENTAL_OPTIONAL +#endif // META_UTIL_OPTIONAL_H_ diff --git a/include/util/string_view.h b/include/util/string_view.h index 92084d433..0150674e2 100644 --- a/include/util/string_view.h +++ b/include/util/string_view.h @@ -10,6 +10,23 @@ #ifndef META_UTIL_STRING_VIEW_H_ #define META_UTIL_STRING_VIEW_H_ +#if META_HAS_EXPERIMENTAL_STRING_VIEW +#include +namespace meta +{ +namespace util +{ +template > +using basic_string_view = std::experimental::basic_string_view; + +using string_view = basic_string_view; +using u16string_view = basic_string_view; +using u32string_view = basic_string_view; +using wstring_view = basic_string_view; +} +} +#else + #include #include #include @@ -629,4 +646,5 @@ struct hash> } }; } -#endif +#endif // !META_HAS_EXPERIMENTAL_STRING_VIEW +#endif // META_UTIL_STRING_VIEW_H_ From 8382725ce0f063aaa506c717f5afb321e268aaac Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sun, 6 Sep 2015 16:37:14 -0500 Subject: [PATCH 235/481] Get rid of maybe uninitialized warning in gcc. This variable will always be set within the loop before use, so I just picked an arbitrary value of the same type to initialize. 
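A minimal sketch of the warning pattern being silenced here (illustrative code, not from the MeTA sources): GCC cannot prove that a variable first assigned inside a conditional within a loop is written before it is read, so -Wmaybe-uninitialized fires even though the first iteration always assigns it. Giving the variable an arbitrary but valid initial value quiets the warning without changing behavior.

~~~cpp
#include <limits>
#include <vector>

// Hypothetical stand-in for the myopic_search loop: `best` is always
// assigned on the first iteration (xs is assumed non-empty), but the
// compiler's flow analysis cannot see that.
int smallest(const std::vector<int>& xs)
{
    int best{xs.front()}; // arbitrary-but-valid initializer silences the warning
    double best_distance = std::numeric_limits<double>::max();
    for (int x : xs)
    {
        if (x < best_distance)
        {
            best_distance = x;
            best = x;
        }
    }
    return best;
}
~~~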
---
 include/graph/algorithms/search.tcc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/graph/algorithms/search.tcc b/include/graph/algorithms/search.tcc
index 4cb084836..ee24c2622 100644
--- a/include/graph/algorithms/search.tcc
+++ b/include/graph/algorithms/search.tcc
@@ -23,7 +23,7 @@ std::vector<node_id> myopic_search(Graph& g, node_id src, node_id dest)
     {
         if (path.size() > g.size())
             throw graph_algorithm_exception{"no path found in myopic search"};
-        node_id best_id;
+        node_id best_id{cur};
         double best_distance = std::numeric_limits<double>::max();
         for (auto& n : g.adjacent(cur))
         {

From aa30e5b66b51211f8e821e4c493ec13feeb6db7e Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sun, 6 Sep 2015 16:38:54 -0500
Subject: [PATCH 236/481] Silence maybe uninitialized warning on GCC.

---
 include/graph/algorithms/search.tcc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/graph/algorithms/search.tcc b/include/graph/algorithms/search.tcc
index ee24c2622..df5c91fed 100644
--- a/include/graph/algorithms/search.tcc
+++ b/include/graph/algorithms/search.tcc
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include "util/optional.h"

 namespace meta
 {
@@ -23,7 +24,7 @@ std::vector<node_id> myopic_search(Graph& g, node_id src, node_id dest)
     {
         if (path.size() > g.size())
             throw graph_algorithm_exception{"no path found in myopic search"};
-        node_id best_id{cur};
+        util::optional<node_id> best_id;
         double best_distance = std::numeric_limits<double>::max();
         for (auto& n : g.adjacent(cur))
         {
@@ -36,7 +37,7 @@ std::vector<node_id> myopic_search(Graph& g, node_id src, node_id dest)
             }
         }

-        cur = best_id;
+        cur = *best_id;
         path.push_back(cur);
     }

From b9b2d21f7307c92e4258063ddf45a464fb0eb7da Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sat, 5 Sep 2015 16:53:22 -0500
Subject: [PATCH 237/481] Move static functions in analyzer to free functions.
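For callers, the move is mechanical; a minimal sketch of the before/after (the wrapper function here is hypothetical, the calls match the diffs below):

~~~cpp
#include "analyzers/analyzer.h"
#include "cpptoml.h"

// Hypothetical call site: build an analyzer from a parsed config table.
std::unique_ptr<meta::analyzers::analyzer>
    build_analyzer(const cpptoml::table& config)
{
    // before: a static member function
    // return meta::analyzers::analyzer::load(config);

    // after: a free function in the analyzers namespace
    return meta::analyzers::load(config);
}
~~~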
---
 include/analyzers/analyzer.h                  | 90 +++++++++----------
 src/analyzers/analyzer.cpp                    | 33 +++----
 src/analyzers/ngram/ngram_word_analyzer.cpp   | 2 +-
 src/analyzers/tools/tokenize_test.cpp         | 2 +-
 src/index/forward_index.cpp                   | 2 +-
 src/index/inverted_index.cpp                  | 7 +-
 src/lm/language_model.cpp                     | 2 +-
 src/parser/analyzers/tree_analyzer.cpp        | 2 +-
 src/sequence/analyzers/ngram_pos_analyzer.cpp | 2 +-
 src/test/analyzer_test.cpp                    | 2 +-
 10 files changed, 71 insertions(+), 73 deletions(-)

diff --git a/include/analyzers/analyzer.h b/include/analyzers/analyzer.h
index 234a8edb4..54450bafc 100644
--- a/include/analyzers/analyzer.h
+++ b/include/analyzers/analyzer.h
@@ -57,52 +57,6 @@ class analyzer
      */
     virtual std::unique_ptr<analyzer> clone() const = 0;

-    /**
-     * @param config The config group used to create the analyzer from
-     * @return an analyzer as specified by a config object
-     */
-    static std::unique_ptr<analyzer> load(const cpptoml::table& config);
-
-    /**
-     * @param config The config group used to create the analyzer from
-     * @return the default filter chain for this version of MeTA,
-     * based on a config object
-     */
-    static std::unique_ptr<token_stream>
-        default_filter_chain(const cpptoml::table& config);
-
-    /**
-     * @param config The config group used to create the analyzer from
-     * @return the default filter chain for unigram words for this version
-     * of MeTA, based on a config object
-     */
-    static std::unique_ptr<token_stream>
-        default_unigram_chain(const cpptoml::table& config);
-
-    /**
-     * @param global The original config object with all parameters
-     * @param config The config group used to create the filters from
-     * @return a filter chain as specified by a config object
-     */
-    static std::unique_ptr<token_stream>
-        load_filters(const cpptoml::table& global,
-                     const cpptoml::table& config);
-
-    /**
-     * @param src The token stream that will feed into this filter
-     * @param config The config group used to create the filter from
-     * @return a single filter specified by a config object
-     */
-    static std::unique_ptr<token_stream>
-        load_filter(std::unique_ptr<token_stream> src,
-                    const cpptoml::table& config);
-
-    /**
-     * @param doc The document to get content for
-     * @return the contents of the document, as a string
-     */
-    static std::string get_content(const corpus::document& doc);
-
   public:
     /**
      * Basic exception for analyzer interactions.
@@ -113,6 +67,50 @@ class analyzer
         using std::runtime_error::runtime_error;
     };
 };
+
+/**
+ * @param config The config group used to create the analyzer from
+ * @return an analyzer as specified by a config object
+ */
+std::unique_ptr<analyzer> load(const cpptoml::table& config);
+
+/**
+ * @param config The config group used to create the analyzer from
+ * @return the default filter chain for this version of MeTA,
+ * based on a config object
+ */
+std::unique_ptr<token_stream>
+    default_filter_chain(const cpptoml::table& config);
+
+/**
+ * @param config The config group used to create the analyzer from
+ * @return the default filter chain for unigram words for this version
+ * of MeTA, based on a config object
+ */
+std::unique_ptr<token_stream>
+    default_unigram_chain(const cpptoml::table& config);
+
+/**
+ * @param global The original config object with all parameters
+ * @param config The config group used to create the filters from
+ * @return a filter chain as specified by a config object
+ */
+std::unique_ptr<token_stream> load_filters(const cpptoml::table& global,
+                                           const cpptoml::table& config);
+
+/**
+ * @param src The token stream that will feed into this filter
+ * @param config The config group used to create the filter from
+ * @return a single filter specified by a config object
+ */
+std::unique_ptr<token_stream> load_filter(std::unique_ptr<token_stream> src,
+                                          const cpptoml::table& config);
+
+/**
+ * @param doc The document to get content for
+ * @return the contents of the document, as a string
+ */
+std::string get_content(const corpus::document& doc);
 }
 }
 #endif
diff --git a/src/analyzers/analyzer.cpp b/src/analyzers/analyzer.cpp
index 21c42fdf4..ef87263ec 100644
--- a/src/analyzers/analyzer.cpp
+++ b/src/analyzers/analyzer.cpp
@@ -24,10 +24,10 @@ namespace meta
 namespace analyzers
 {

-std::string analyzer::get_content(const corpus::document& doc)
+std::string get_content(const corpus::document& doc)
 {
     if (!doc.contains_content())
-        throw analyzer_exception{
+        throw analyzer::analyzer_exception{
             "document content was not populated for analysis"};

     return utf::to_utf8(doc.content(), doc.encoding());
@@ -52,8 +52,7 @@
     }
 }

-std::unique_ptr<token_stream>
-    analyzer::default_filter_chain(const cpptoml::table& config)
+std::unique_ptr<token_stream> default_filter_chain(const cpptoml::table& config)
 {
     auto tokenizer = make_unique<tokenizers::icu_tokenizer>();
     auto result = add_default_filters(std::move(tokenizer), config);
@@ -62,26 +61,25 @@
 }

 std::unique_ptr<token_stream>
-    analyzer::default_unigram_chain(const cpptoml::table& config)
+    default_unigram_chain(const cpptoml::table& config)
 {
     // suppress "<s>", "</s>"
     auto tokenizer = make_unique<tokenizers::icu_tokenizer>(true);
     return add_default_filters(std::move(tokenizer), config);
 }

-std::unique_ptr<token_stream>
-    analyzer::load_filter(std::unique_ptr<token_stream> src,
-                          const cpptoml::table& config)
+std::unique_ptr<token_stream> load_filter(std::unique_ptr<token_stream> src,
+                                          const cpptoml::table& config)
 {
     auto type = config.get_as<std::string>("type");
     if (!type)
-        throw analyzer_exception{"filter type missing in config file"};
+        throw analyzer::analyzer_exception{
+            "filter type missing in config file"};
     return filter_factory::get().create(*type, std::move(src), config);
 }

-std::unique_ptr<token_stream>
-    analyzer::load_filters(const cpptoml::table& global,
-                           const cpptoml::table& config)
+std::unique_ptr<token_stream> load_filters(const cpptoml::table& global,
+                                           const cpptoml::table& config)
 {
     auto check = config.get_as<std::string>("filter");
@@ -92,19 +90,21 @@
     else if (*check == "default-unigram-chain")
         return default_unigram_chain(global);
     else
-        throw analyzer_exception{"unknown filter option: " + *check};
+        throw analyzer::analyzer_exception{"unknown filter option: "
+                                           + *check};
     }

     auto filters = config.get_table_array("filter");
     if (!filters)
-        throw analyzer_exception{"analyzer group missing filter configuration"};
+        throw analyzer::analyzer_exception{
+            "analyzer group missing filter configuration"};
     std::unique_ptr<token_stream> result;
     for (const auto filter : filters->get())
         result = load_filter(std::move(result), *filter);
     return result;
 }

-std::unique_ptr<analyzer> analyzer::load(const cpptoml::table& config)
+std::unique_ptr<analyzer> load(const cpptoml::table& config)
 {
     using namespace analyzers;
     std::vector<std::unique_ptr<analyzer>> toks;
     auto analyzers = config.get_table_array("analyzers");
     for (auto group : analyzers->get())
     {
         auto method = group->get_as<std::string>("method");
         if (!method)
-            throw analyzer_exception{"failed to find analyzer method"};
+            throw analyzer::analyzer_exception{
+                "failed to find analyzer method"};
         toks.emplace_back(
             analyzer_factory::get().create(*method, config, *group));
     }
diff --git a/src/analyzers/ngram/ngram_word_analyzer.cpp b/src/analyzers/ngram/ngram_word_analyzer.cpp
index 6550c124e..eb068e51d 100644
--- a/src/analyzers/ngram/ngram_word_analyzer.cpp
+++ b/src/analyzers/ngram/ngram_word_analyzer.cpp
@@ -60,7 +60,7 @@
         throw analyzer::analyzer_exception{
             "ngram size needed for ngram word analyzer in config file"};

-    auto filts = analyzer::load_filters(global, config);
+    auto filts = load_filters(global, config);
     return make_unique<ngram_word_analyzer>(*n_val, std::move(filts));
 }
diff --git a/src/analyzers/tools/tokenize_test.cpp b/src/analyzers/tools/tokenize_test.cpp
index 975fd7887..b8f79057a 100644
--- a/src/analyzers/tools/tokenize_test.cpp
+++ b/src/analyzers/tools/tokenize_test.cpp
@@ -33,7 +33,7 @@ int main(int argc, char** argv)

         if (*method == analyzers::ngram_word_analyzer::id)
         {
-            stream = analyzers::analyzer::load_filters(config, *group);
+            stream = analyzers::load_filters(config, *group);
             break;
         }
     }
diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp
index 9d5ea157d..6f75d0404 100644
--- a/src/index/forward_index.cpp
+++ b/src/index/forward_index.cpp
@@ -231,7 +231,7 @@ void forward_index::create_index(const std::string& config_file)
     auto docs = corpus::corpus::load(config_file);
     {
-        auto analyzer = analyzers::analyzer::load(config);
+        auto analyzer = analyzers::load(config);

         metadata_writer mdata_writer{index_name(), docs->size(),
                                      docs->schema()};
diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp
index dc270c913..0363b53a8 100644
--- a/src/index/inverted_index.cpp
+++ b/src/index/inverted_index.cpp
@@ -76,9 +76,7 @@
 };

 inverted_index::impl::impl(inverted_index* idx, const cpptoml::table& config)
-    : idx_{idx},
-      analyzer_{analyzers::analyzer::load(config)},
-      total_corpus_terms_{0}
+    : idx_{idx}, analyzer_{analyzers::load(config)}, total_corpus_terms_{0}
 {
     // nothing
 }
@@ -142,7 +140,8 @@ void inverted_index::create_index(const std::string& config_file)

     LOG(info) << "Created uncompressed postings file " << index_name()
               << impl_->files[POSTINGS] << " ("
-              << printing::bytes_to_units(inverter.final_size()) << ")" << ENDLG;
+              << printing::bytes_to_units(inverter.final_size()) << ")"
+              << ENDLG;

     uint64_t num_unique_terms = inverter.unique_primary_keys();
     inv_impl_->compress(index_name() + impl_->files[POSTINGS],
diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp
index 8d0d5cf97..b70bc8e11 100644
--- a/src/lm/language_model.cpp
+++ b/src/lm/language_model.cpp
@@ -66,7 +66,7 @@
     while (corpus->has_next())
     {
         auto doc = corpus->next();
-        stream->set_content(analyzer::get_content(doc));
+        stream->set_content(analyzers::get_content(doc));

         // get ngram stream started
         std::deque<std::string> ngram;
diff --git a/src/parser/analyzers/tree_analyzer.cpp b/src/parser/analyzers/tree_analyzer.cpp
index b418f120a..c59bbba2e 100644
--- a/src/parser/analyzers/tree_analyzer.cpp
+++ b/src/parser/analyzers/tree_analyzer.cpp
@@ -92,7 +92,7 @@
         throw analyzer::analyzer_exception{
             "tree analyzer needs an array of features to generate"};

-    auto filts = analyzer::load_filters(global, config);
+    auto filts = load_filters(global, config);
     auto ana = make_unique<tree_analyzer>(std::move(filts), *tagger_prefix,
                                           *parser_prefix);
diff --git a/src/sequence/analyzers/ngram_pos_analyzer.cpp b/src/sequence/analyzers/ngram_pos_analyzer.cpp
index c35b7beba..8d61a64ba 100644
--- a/src/sequence/analyzers/ngram_pos_analyzer.cpp
+++ b/src/sequence/analyzers/ngram_pos_analyzer.cpp
@@ -101,7 +101,7 @@
         throw analyzer::analyzer_exception{
             "ngram-pos analyzer must contain a prefix to a crf model"};

-    auto filts = analyzer::load_filters(global, config);
+    auto filts = load_filters(global, config);
     return make_unique<ngram_pos_analyzer>(*n_val, std::move(filts),
                                            *crf_prefix);
 }
diff --git a/src/test/analyzer_test.cpp b/src/test/analyzer_test.cpp
index ee75d6b5e..58e26027f 100644
--- a/src/test/analyzer_test.cpp
+++ b/src/test/analyzer_test.cpp
@@ -21,7 +21,7 @@ std::unique_ptr<token_stream> make_filter()
     using namespace analyzers;
     create_config("line");
     auto config = cpptoml::parse_file("test-config.toml");
-    return analyzers::analyzer::default_filter_chain(config);
+    return analyzers::default_filter_chain(config);
 }
 }

From eb5c620ba6d1308a8a4160ea02cd6903ced95c40 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sat, 5 Sep 2015 16:58:49 -0500
Subject: [PATCH 238/481] Extract analyzer_exception from analyzer class.

This will make things easier when analyzer becomes analyzer<T>.

---
 include/analyzers/analyzer.h                  | 15 +++++++--------
 include/parser/analyzers/tree_analyzer.h      | 6 ++++--
 src/analyzers/analyzer.cpp                    | 14 +++++---------
 src/analyzers/ngram/ngram_word_analyzer.cpp   | 2 +-
 src/parser/analyzers/tree_analyzer.cpp        | 14 ++++----------
 src/sequence/analyzers/ngram_pos_analyzer.cpp | 4 ++--
 6 files changed, 23 insertions(+), 32 deletions(-)

diff --git a/include/analyzers/analyzer.h b/include/analyzers/analyzer.h
index 54450bafc..369300df8 100644
--- a/include/analyzers/analyzer.h
+++ b/include/analyzers/analyzer.h
@@ -56,16 +56,15 @@ class analyzer
      * Clones this analyzer.
      */
     virtual std::unique_ptr<analyzer> clone() const = 0;
+};

+/**
+ * Basic exception for analyzer interactions.
+ */
+class analyzer_exception : public std::runtime_error
+{
   public:
-    /**
-     * Basic exception for analyzer interactions.
-     */
-    class analyzer_exception : public std::runtime_error
-    {
-      public:
-        using std::runtime_error::runtime_error;
-    };
+    using std::runtime_error::runtime_error;
 };

 /**
diff --git a/include/parser/analyzers/tree_analyzer.h b/include/parser/analyzers/tree_analyzer.h
index 41614a425..035705d6d 100644
--- a/include/parser/analyzers/tree_analyzer.h
+++ b/include/parser/analyzers/tree_analyzer.h
@@ -74,11 +74,13 @@ class tree_analyzer : public util::clonable
     const static std::string id;

   private:
+    using tree_featurizer_list
+        = std::vector<std::unique_ptr<const tree_featurizer>>;
+
     /**
      * A list of tree_featurizers to run on each parse tree.
      */
-    std::shared_ptr<std::vector<std::unique_ptr<const tree_featurizer>>>
-        featurizers_;
+    std::shared_ptr<tree_featurizer_list> featurizers_;

     /**
      * The token stream for extracting tokens.
diff --git a/src/analyzers/analyzer.cpp b/src/analyzers/analyzer.cpp
index ef87263ec..224eef9fb 100644
--- a/src/analyzers/analyzer.cpp
+++ b/src/analyzers/analyzer.cpp
@@ -27,7 +27,7 @@ namespace analyzers
 std::string get_content(const corpus::document& doc)
 {
     if (!doc.contains_content())
-        throw analyzer::analyzer_exception{
+        throw analyzer_exception{
             "document content was not populated for analysis"};

     return utf::to_utf8(doc.content(), doc.encoding());
@@ -73,8 +73,7 @@
 {
     auto type = config.get_as<std::string>("type");
     if (!type)
-        throw analyzer::analyzer_exception{
-            "filter type missing in config file"};
+        throw analyzer_exception{"filter type missing in config file"};
     return filter_factory::get().create(*type, std::move(src), config);
 }
@@ -90,14 +89,12 @@
     else if (*check == "default-unigram-chain")
         return default_unigram_chain(global);
     else
-        throw analyzer::analyzer_exception{"unknown filter option: "
-                                           + *check};
+        throw analyzer_exception{"unknown filter option: " + *check};
     }

     auto filters = config.get_table_array("filter");
     if (!filters)
-        throw analyzer::analyzer_exception{
-            "analyzer group missing filter configuration"};
+        throw analyzer_exception{"analyzer group missing filter configuration"};
     std::unique_ptr<token_stream> result;
     for (const auto filter : filters->get())
         result = load_filter(std::move(result), *filter);
     return result;
 }
@@ -113,8 +110,7 @@
     {
         auto method = group->get_as<std::string>("method");
         if (!method)
-            throw analyzer::analyzer_exception{
-                "failed to find analyzer method"};
+            throw analyzer_exception{"failed to find analyzer method"};
         toks.emplace_back(
             analyzer_factory::get().create(*method, config, *group));
     }
diff --git a/src/analyzers/ngram/ngram_word_analyzer.cpp b/src/analyzers/ngram/ngram_word_analyzer.cpp
index eb068e51d..545fbc8ec 100644
--- a/src/analyzers/ngram/ngram_word_analyzer.cpp
+++ b/src/analyzers/ngram/ngram_word_analyzer.cpp
@@ -57,7 +57,7 @@
 {
     auto n_val = config.get_as<int64_t>("ngram");
     if (!n_val)
-        throw analyzer::analyzer_exception{
+        throw analyzer_exception{
             "ngram size needed for ngram word analyzer in config file"};

     auto filts = load_filters(global, config);
diff --git a/src/parser/analyzers/tree_analyzer.cpp b/src/parser/analyzers/tree_analyzer.cpp
index c59bbba2e..d57236b5f 100644
--- a/src/parser/analyzers/tree_analyzer.cpp
+++ b/src/parser/analyzers/tree_analyzer.cpp
@@ -19,11 +19,7 @@ const std::string tree_analyzer::id = "tree";

 tree_analyzer::tree_analyzer(std::unique_ptr<token_stream> stream,
                              const std::string& tagger_prefix,
                              const std::string& parser_prefix)
-    : featurizers_{
-          std::
-              make_shared<std::vector<std::unique_ptr<const tree_featurizer>>>()},
+    : featurizers_{std::make_shared<tree_featurizer_list>()},
       stream_{std::move(stream)},
       tagger_{std::make_shared<sequence::perceptron>(tagger_prefix)},
       parser_{std::make_shared<parser::sr_parser>(parser_prefix)}
@@ -79,17 +75,15 @@
 {
     auto tagger_prefix = config.get_as<std::string>("tagger");
     if (!tagger_prefix)
-        throw analyzer::analyzer_exception{
-            "tree analyzer requires a tagger directory"};
+        throw analyzer_exception{"tree analyzer requires a tagger directory"};

     auto parser_prefix = config.get_as<std::string>("parser");
     if (!parser_prefix)
-        throw analyzer::analyzer_exception{
-            "tree analyzer requires a parser directory"};
+        throw analyzer_exception{"tree analyzer requires a parser directory"};

     auto feat_arr = config.get_array("features");
     if (!feat_arr)
-        throw analyzer::analyzer_exception{
+        throw analyzer_exception{
             "tree analyzer needs an array of features to generate"};

     auto filts = load_filters(global, config);
diff --git a/src/sequence/analyzers/ngram_pos_analyzer.cpp b/src/sequence/analyzers/ngram_pos_analyzer.cpp
index 8d61a64ba..9cac3ed2e 100644
--- a/src/sequence/analyzers/ngram_pos_analyzer.cpp
+++ b/src/sequence/analyzers/ngram_pos_analyzer.cpp
@@ -93,12 +93,12 @@
 {
     auto n_val = config.get_as<int64_t>("ngram");
     if (!n_val)
-        throw analyzer::analyzer_exception{
+        throw analyzer_exception{
             "ngram size needed for ngram pos analyzer in config file"};

     auto crf_prefix = config.get_as<std::string>("crf-prefix");
     if (!crf_prefix)
-        throw analyzer::analyzer_exception{
+        throw analyzer_exception{
             "ngram-pos analyzer must contain a prefix to a crf model"};

     auto filts = load_filters(global, config);

From 81f07f22ba4076a10d2bb7f100a68d05e8502932 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sat, 5 Sep 2015 18:15:47 -0500
Subject: [PATCH 239/481] Change analyzer to analyzer<T>.

This hasn't changed functionality much at all, yet, but prepares us for
changing the behavior of tokenize(doc) to depend on T.

This required some changes to how the analyzer factory worked to allow
for partial specialization in the factory functions (you can partially
specialize classes, but not functions). The change isn't too onerous, so
I'm not too worried about it.

The extern template and explicit template instantiation will need to be
explained in the documentation more, but I don't think it's too
confusing: essentially there are only ever two possible instantiations
of your class, so we don't actually want to header-ize everything and
re-compile the whole analyzer every time it's used. Instead, we force
everything into a library file, despite the fact that it's a template,
because we can explicitly instantiate our two cases at the end of the
file.

---
 include/analyzers/analyzer.h                  | 24 ++++++++++-
 include/analyzers/analyzer_factory.h          | 38 +++++++++++++----
 include/analyzers/multi_analyzer.h            | 11 +++--
 include/analyzers/ngram/ngram_analyzer.h      | 7 +++-
 include/analyzers/ngram/ngram_word_analyzer.h | 25 +++++++----
 include/parser/analyzers/tree_analyzer.h      | 20 ++++++---
 .../sequence/analyzers/ngram_pos_analyzer.h   | 26 ++++++++----
 src/analyzers/analyzer.cpp                    | 14 +++++--
 src/analyzers/analyzer_factory.cpp            | 14 +++++--
 src/analyzers/multi_analyzer.cpp              | 16 +++++--
 src/analyzers/ngram/ngram_analyzer.cpp        | 14 +++++--
 src/analyzers/ngram/ngram_word_analyzer.cpp   | 30 ++++++++-----
 src/analyzers/tools/tokenize_test.cpp         | 2 +-
 src/index/forward_index.cpp                   | 6 +--
 src/index/inverted_index.cpp                  | 6 ++-
 src/parser/analyzers/tree_analyzer.cpp        | 40 +++++++++++-------
 src/sequence/analyzers/ngram_pos_analyzer.cpp | 42 ++++++++++++-------
 src/test/analyzer_test.cpp                    | 12 +++---
 src/tools/profile.cpp                         | 2 +-
 19 files changed, 244 insertions(+), 105 deletions(-)

diff --git a/include/analyzers/analyzer.h b/include/analyzers/analyzer.h
index 369300df8..dec447d3c 100644
--- a/include/analyzers/analyzer.h
+++ b/include/analyzers/analyzer.h
@@ -37,10 +37,26 @@ class token_stream;

 /**
  * A class that provides a framework to produce token counts from documents.
  * All analyzers inherit from this class and (possibly) implement tokenize().
+ *
+ * The template argument for an analyzer indicates the supported feature
+ * value for the analyzer, which is either uint64_t for inverted_index or
+ * double for forward_index.
+ *
+ * When defining your own subclass of analyzer, you should be sure to
+ * subclass from the appropriate type.
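+ *
+ * For example, an analyzer that produces integral term counts should
+ * derive from analyzer<uint64_t> (the inverted_index case), while one
+ * that produces real-valued features should derive from analyzer<double>
+ * (the forward_index case).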
  */
+template <class T>
 class analyzer
 {
   public:
+    static_assert(std::is_same<T, uint64_t>::value
+                      || std::is_same<T, double>::value,
+                  "analyzers can only produce unsigned integer or real valued "
+                  "feature values");
+
+    using base_type = analyzer;
+    using feature_value_type = T;
+
     /**
      * A default virtual destructor.
      */
@@ -71,7 +87,13 @@ class analyzer_exception : public std::runtime_error
  * @param config The config group used to create the analyzer from
  * @return an analyzer as specified by a config object
  */
-std::unique_ptr<analyzer> load(const cpptoml::table& config);
+template <class T>
+std::unique_ptr<analyzer<T>> load(const cpptoml::table& config);
+
+extern template std::unique_ptr<analyzer<uint64_t>>
+    load(const cpptoml::table& config);
+extern template std::unique_ptr<analyzer<double>>
+    load(const cpptoml::table& config);

 /**
  * @param config The config group used to create the analyzer from
diff --git a/include/analyzers/analyzer_factory.h b/include/analyzers/analyzer_factory.h
index 90467cd9a..db1e97778 100644
--- a/include/analyzers/analyzer_factory.h
+++ b/include/analyzers/analyzer_factory.h
@@ -29,10 +29,14 @@ namespace analyzers
 * files. Clients should use the register_analyzer method instead of this
 * class directly.
 */
+template <class T>
 class analyzer_factory
     : public util::factory<analyzer_factory<T>, analyzer<T>,
                            const cpptoml::table&, const cpptoml::table&>
 {
+    using base_factory = typename analyzer_factory::base_factory;
+    using factory_method = typename base_factory::factory_method;
+
     /// friend the base class
     friend base_factory;

@@ -53,15 +57,32 @@ class analyzer_factory
     std::unordered_map<std::string, factory_method> methods_;
 };

+extern template class analyzer_factory<uint64_t>;
+extern template class analyzer_factory<double>;
+
+/**
+ * Traits class for analyzers. You should specialize this class if you need
+ * to customize creation behavior for your analyzer class. This is a class
+ * template to allow for partial specializations as well.
+ */
+template <class Analyzer>
+struct analyzer_traits
+{
+    static std::unique_ptr<Analyzer>
+        create(const cpptoml::table&, const cpptoml::table&)
+    {
+        return make_unique<Analyzer>();
+    }
+};
+
 /**
- * Factory method for creating an analyzer. This should be specialized if
- * your given tokenizer requires special construction behavior.
+ * Factory method for creating an analyzer.
 */
 template <class Analyzer>
-std::unique_ptr<Analyzer> make_analyzer(const cpptoml::table&,
-                                        const cpptoml::table&)
+std::unique_ptr<Analyzer>
+    make_analyzer(const cpptoml::table& global, const cpptoml::table& config)
 {
-    return make_unique<Analyzer>();
+    return analyzer_traits<Analyzer>::create(global, config);
 }

 /**
@@ -71,7 +92,8 @@
 template <class Analyzer>
 void register_analyzer()
 {
-    analyzer_factory::get().add(Analyzer::id, make_analyzer<Analyzer>);
+    analyzer_factory<typename Analyzer::feature_value_type>::get().add(
+        Analyzer::id, make_analyzer<Analyzer>);
 }
 }
 }
diff --git a/include/analyzers/multi_analyzer.h b/include/analyzers/multi_analyzer.h
index 9f1a7b84d..2ae6021ad 100644
--- a/include/analyzers/multi_analyzer.h
+++ b/include/analyzers/multi_analyzer.h
@@ -29,14 +29,15 @@ namespace analyzers
 * rewrite rules. The multi_analyzer keeps track of all the features in one set
 * for however many internal analyzers it contains.
 */
-class multi_analyzer : public util::clonable<analyzer, multi_analyzer>
+template <class T>
+class multi_analyzer : public util::clonable<analyzer<T>, multi_analyzer<T>>
 {
   public:
     /**
      * Constructs a multi_analyzer from a vector of other analyzers.
      * @param toks A vector of analyzers to combine features from
      */
-    multi_analyzer(std::vector<std::unique_ptr<analyzer>>&& toks);
+    multi_analyzer(std::vector<std::unique_ptr<analyzer<T>>>&& toks);

     /**
      * Copy constructor.
@@ -52,9 +53,11 @@

   private:
     /// Holds all the analyzers in this multi_analyzer
-    std::vector<std::unique_ptr<analyzer>> analyzers_;
+    std::vector<std::unique_ptr<analyzer<T>>> analyzers_;
 };
+
+extern template class multi_analyzer<uint64_t>;
+extern template class multi_analyzer<double>;
 }
 }
-
 #endif
diff --git a/include/analyzers/ngram/ngram_analyzer.h b/include/analyzers/ngram/ngram_analyzer.h
index b4a8f5c90..484faf584 100644
--- a/include/analyzers/ngram/ngram_analyzer.h
+++ b/include/analyzers/ngram/ngram_analyzer.h
@@ -24,7 +24,8 @@ namespace analyzers
 * supplied by the user. This class is abstract, as it only provides the
 * framework for ngram tokenization.
 */
-class ngram_analyzer : public analyzer
+template <class T>
+class ngram_analyzer : public analyzer<T>
 {
   public:
     /**
@@ -50,7 +51,9 @@
     /// The value of n for this ngram analyzer
     uint16_t n_val_;
 };
+
+extern template class ngram_analyzer<uint64_t>;
+extern template class ngram_analyzer<double>;
 }
 }
-
 #endif
diff --git a/include/analyzers/ngram/ngram_word_analyzer.h b/include/analyzers/ngram/ngram_word_analyzer.h
index 6e5542ecd..95b8569e9 100644
--- a/include/analyzers/ngram/ngram_word_analyzer.h
+++ b/include/analyzers/ngram/ngram_word_analyzer.h
@@ -33,11 +33,12 @@ namespace analyzers
 *
 * @see https://meta-toolkit.org/analyzers-filters-tutorial.html
 */
+template <class T>
 class ngram_word_analyzer
-    : public util::multilevel_clonable<analyzer, ngram_analyzer,
-                                       ngram_word_analyzer>
+    : public util::multilevel_clonable<analyzer<T>, ngram_analyzer<T>,
+                                       ngram_word_analyzer<T>>
 {
-    using base = util::multilevel_clonable<analyzer, ngram_analyzer,
-                                           ngram_word_analyzer>;
+    using base = util::multilevel_clonable<analyzer<T>, ngram_analyzer<T>,
+                                           ngram_word_analyzer<T>>;

   public:
@@ -69,12 +70,20 @@
 };

 /**
- * Specialization of the factory method for creating ngram_word_analyzers.
+ * Specialization of the traits class used by the factory method for
+ * creating ngram_word_analyzers.
 */
-template <>
-std::unique_ptr<ngram_word_analyzer>
-    make_analyzer<ngram_word_analyzer>(const cpptoml::table&,
-                                       const cpptoml::table&);
+template <class T>
+struct analyzer_traits<ngram_word_analyzer<T>>
+{
+    static std::unique_ptr<ngram_word_analyzer<T>> create(const cpptoml::table&,
+                                                          const cpptoml::table&);
+};
+
+extern template class ngram_word_analyzer<uint64_t>;
+extern template class ngram_word_analyzer<double>;
+extern template struct analyzer_traits<ngram_word_analyzer<uint64_t>>;
+extern template struct analyzer_traits<ngram_word_analyzer<double>>;
 }
 }
 #endif
diff --git a/include/parser/analyzers/tree_analyzer.h b/include/parser/analyzers/tree_analyzer.h
index 035705d6d..2aaa8d486 100644
--- a/include/parser/analyzers/tree_analyzer.h
+++ b/include/parser/analyzers/tree_analyzer.h
@@ -41,7 +41,8 @@ namespace analyzers
 * @see https://meta-toolkit.org/analyzers-filters-tutorial.html
 */
-class tree_analyzer : public util::clonable<analyzer, tree_analyzer>
+template <class T>
+class tree_analyzer : public util::clonable<analyzer<T>, tree_analyzer<T>>
 {
   public:
     /**
@@ -104,11 +105,20 @@
 };

 /**
- * Specialization of the factory method for creating tree analyzers.
+ * Specialization of the traits class used by the factory method for
+ * creating tree analyzers.
 */
-template <>
-std::unique_ptr<tree_analyzer> make_analyzer(const cpptoml::table&,
-                                             const cpptoml::table&);
+template <class T>
+struct analyzer_traits<tree_analyzer<T>>
+{
+    static std::unique_ptr<tree_analyzer<T>> create(const cpptoml::table&,
+                                                    const cpptoml::table&);
+};
+
+extern template class tree_analyzer<uint64_t>;
+extern template class tree_analyzer<double>;
+extern template struct analyzer_traits<tree_analyzer<uint64_t>>;
+extern template struct analyzer_traits<tree_analyzer<double>>;
 }

 namespace parser
diff --git a/include/sequence/analyzers/ngram_pos_analyzer.h b/include/sequence/analyzers/ngram_pos_analyzer.h
index 2547845cb..414cca44b 100644
--- a/include/sequence/analyzers/ngram_pos_analyzer.h
+++ b/include/sequence/analyzers/ngram_pos_analyzer.h
@@ -41,13 +41,13 @@ namespace analyzers
 * Optional config parameters: none.
 *
 * @see https://meta-toolkit.org/analyzers-filters-tutorial.html
- */
+template <class T>
 class ngram_pos_analyzer
-    : public util::multilevel_clonable<analyzer, ngram_analyzer,
-                                       ngram_pos_analyzer>
+    : public util::multilevel_clonable<analyzer<T>, ngram_analyzer<T>,
+                                       ngram_pos_analyzer<T>>
 {
-    using base = util::multilevel_clonable<analyzer, ngram_analyzer,
-                                           ngram_pos_analyzer>;
+    using base = util::multilevel_clonable<analyzer<T>, ngram_analyzer<T>,
+                                           ngram_pos_analyzer<T>>;

   public:
@@ -87,12 +87,20 @@
 };

 /**
- * Specialization of the factory method for creating ngram_pos_analyzers.
+ * Specialization of the traits class used by the factory method for
+ * creating ngram_pos_analyzers.
 */
-template <>
-std::unique_ptr<ngram_pos_analyzer>
-    make_analyzer<ngram_pos_analyzer>(const cpptoml::table&,
-                                      const cpptoml::table&);
+template <class T>
+struct analyzer_traits<ngram_pos_analyzer<T>>
+{
+    static std::unique_ptr<ngram_pos_analyzer<T>> create(const cpptoml::table&,
+                                                         const cpptoml::table&);
+};
+
+extern template class ngram_pos_analyzer<uint64_t>;
+extern template class ngram_pos_analyzer<double>;
+extern template struct analyzer_traits<ngram_pos_analyzer<uint64_t>>;
+extern template struct analyzer_traits<ngram_pos_analyzer<double>>;
 }

 namespace sequence
diff --git a/src/analyzers/analyzer.cpp b/src/analyzers/analyzer.cpp
index 224eef9fb..1e05aec66 100644
--- a/src/analyzers/analyzer.cpp
+++ b/src/analyzers/analyzer.cpp
@@ -101,10 +101,11 @@
     return result;
 }

-std::unique_ptr<analyzer> load(const cpptoml::table& config)
+template <class T>
+std::unique_ptr<analyzer<T>> load(const cpptoml::table& config)
 {
     using namespace analyzers;
-    std::vector<std::unique_ptr<analyzer>> toks;
+    std::vector<std::unique_ptr<analyzer<T>>> toks;
     auto analyzers = config.get_table_array("analyzers");
     for (auto group : analyzers->get())
     {
@@ -112,9 +113,14 @@
         if (!method)
             throw analyzer_exception{"failed to find analyzer method"};
         toks.emplace_back(
-            analyzer_factory::get().create(*method, config, *group));
+            analyzer_factory<T>::get().create(*method, config, *group));
     }
-    return make_unique<multi_analyzer>(std::move(toks));
+    return make_unique<multi_analyzer<T>>(std::move(toks));
 }
+
+// explicitly instantiate the load template function for the two valid
+// feature value types for analyzers
+template std::unique_ptr<analyzer<uint64_t>> load(const cpptoml::table&);
+template std::unique_ptr<analyzer<double>> load(const cpptoml::table&);
 }
 }
diff --git a/src/analyzers/analyzer_factory.cpp b/src/analyzers/analyzer_factory.cpp
index 3c9fd28b0..1d039051f 100644
--- a/src/analyzers/analyzer_factory.cpp
+++ b/src/analyzers/analyzer_factory.cpp
@@ -11,16 +11,22 @@ namespace meta
 namespace analyzers
 {

+template <class T>
 template <class Analyzer>
-void analyzer_factory::register_analyzer()
+void analyzer_factory<T>::register_analyzer()
 {
-    add(Analyzer::id, make_analyzer<Analyzer>);
+    // this-> needed to find the add() method in dependent base class
+    this->add(Analyzer::id, make_analyzer<Analyzer>);
 }

-analyzer_factory::analyzer_factory()
+template <class T>
+analyzer_factory<T>::analyzer_factory()
 {
     // built-in analyzers
-    register_analyzer<multi_analyzer>();
+    register_analyzer<multi_analyzer<T>>();
 }
+
+template class analyzer_factory<uint64_t>;
+template class analyzer_factory<double>;
 }
 }
diff --git a/src/analyzers/multi_analyzer.cpp b/src/analyzers/multi_analyzer.cpp
index 7bd0e7b1f..cfd9fdb08 100644
--- a/src/analyzers/multi_analyzer.cpp
+++ b/src/analyzers/multi_analyzer.cpp
@@ -9,22 +9,30 @@ namespace meta
 namespace analyzers
 {

-multi_analyzer::multi_analyzer(std::vector<std::unique_ptr<analyzer>>&& toks)
+template <class T>
+multi_analyzer<T>::multi_analyzer(
+    std::vector<std::unique_ptr<analyzer<T>>>&& toks)
     : analyzers_{std::move(toks)}
-{/* nothing */
+{
+    /* nothing */
 }

-multi_analyzer::multi_analyzer(const multi_analyzer& other)
+template <class T>
+multi_analyzer<T>::multi_analyzer(const multi_analyzer& other)
 {
     analyzers_.reserve(other.analyzers_.size());
     for (const auto& an : other.analyzers_)
         analyzers_.emplace_back(an->clone());
 }

-void multi_analyzer::tokenize(corpus::document& doc)
+template <class T>
+void multi_analyzer<T>::tokenize(corpus::document& doc)
 {
     for (auto& tok : analyzers_)
         tok->tokenize(doc);
 }
+
+template class multi_analyzer<uint64_t>;
+template class multi_analyzer<double>;
 }
 }
diff --git a/src/analyzers/ngram/ngram_analyzer.cpp b/src/analyzers/ngram/ngram_analyzer.cpp
index 1f2918c2b..ddad4a534 100644
--- a/src/analyzers/ngram/ngram_analyzer.cpp
+++ b/src/analyzers/ngram/ngram_analyzer.cpp
@@ -10,22 +10,30 @@ namespace meta
 namespace analyzers
 {

-ngram_analyzer::ngram_analyzer(uint16_t n) : n_val_{n}
+template <class T>
+ngram_analyzer<T>::ngram_analyzer(uint16_t n)
+    : n_val_{n}
 {
     /* nothing */
 }

-uint16_t ngram_analyzer::n_value() const
+template <class T>
+uint16_t ngram_analyzer<T>::n_value() const
 {
     return n_val_;
 }

-std::string ngram_analyzer::wordify(const std::deque<std::string>& words) const
+template <class T>
+std::string
+    ngram_analyzer<T>::wordify(const std::deque<std::string>& words) const
 {
     std::string result = "";
     for (auto& word : words)
         result += (word + "_");
     return result.substr(0, result.size() - 1);
 }
+
+template class ngram_analyzer<uint64_t>;
+template class ngram_analyzer<double>;
 }
 }
diff --git a/src/analyzers/ngram/ngram_word_analyzer.cpp b/src/analyzers/ngram/ngram_word_analyzer.cpp
index 545fbc8ec..dd869caef 100644
--- a/src/analyzers/ngram/ngram_word_analyzer.cpp
+++ b/src/analyzers/ngram/ngram_word_analyzer.cpp
@@ -16,29 +16,33 @@ namespace meta
 namespace analyzers
 {

-const std::string ngram_word_analyzer::id = "ngram-word";
+template <class T>
+const std::string ngram_word_analyzer<T>::id = "ngram-word";

-ngram_word_analyzer::ngram_word_analyzer(uint16_t n,
-                                         std::unique_ptr<token_stream> stream)
+template <class T>
+ngram_word_analyzer<T>::ngram_word_analyzer(
+    uint16_t n, std::unique_ptr<token_stream> stream)
     : base{n}, stream_{std::move(stream)}
 {
     // nothing
 }

-ngram_word_analyzer::ngram_word_analyzer(const ngram_word_analyzer& other)
+template <class T>
+ngram_word_analyzer<T>::ngram_word_analyzer(const ngram_word_analyzer& other)
     : base{other.n_value()}, stream_{other.stream_->clone()}
 {
     // nothing
 }

-void ngram_word_analyzer::tokenize(corpus::document& doc)
+template <class T>
+void ngram_word_analyzer<T>::tokenize(corpus::document& doc)
 {
     stream_->set_content(get_content(doc));
     std::deque<std::string> tokens;
     while (*stream_)
     {
         tokens.emplace_back(stream_->next());
-        if (tokens.size() == n_value())
+        if (tokens.size() == this->n_value())
         {
             auto combined = std::move(tokens.front());
             tokens.pop_front();
@@ -50,10 +54,9 @@
     }
 }

-template <>
-std::unique_ptr<ngram_word_analyzer>
-    make_analyzer<ngram_word_analyzer>(const cpptoml::table& global,
-                                       const cpptoml::table& config)
+template <class T>
+std::unique_ptr<ngram_word_analyzer<T>> analyzer_traits<ngram_word_analyzer<T>>::create(
+    const cpptoml::table& global, const cpptoml::table& config)
 {
     auto n_val = config.get_as<int64_t>("ngram");
     if (!n_val)
         throw analyzer_exception{
             "ngram size needed for ngram word analyzer in config file"};

     auto filts = load_filters(global, config);
-    return make_unique<ngram_word_analyzer>(*n_val, std::move(filts));
+    return make_unique<ngram_word_analyzer<T>>(*n_val, std::move(filts));
 }
+
+template class ngram_word_analyzer<uint64_t>;
+template class ngram_word_analyzer<double>;
+template struct analyzer_traits<ngram_word_analyzer<uint64_t>>;
+template struct analyzer_traits<ngram_word_analyzer<double>>;
 }
 }
diff --git a/src/analyzers/tools/tokenize_test.cpp b/src/analyzers/tools/tokenize_test.cpp
index b8f79057a..63d5d2851 100644
--- a/src/analyzers/tools/tokenize_test.cpp
+++ b/src/analyzers/tools/tokenize_test.cpp
@@ -31,7 +31,7 @@
         if (!method)
             continue;

-        if (*method == analyzers::ngram_word_analyzer::id)
+        if (*method == analyzers::ngram_word_analyzer<uint64_t>::id)
         {
             stream = analyzers::load_filters(config, *group);
             break;
diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp
index 6f75d0404..fd4b39041 100644
--- a/src/index/forward_index.cpp
+++ b/src/index/forward_index.cpp
@@ -48,7 +48,7 @@
      * merged.
      */
     void tokenize_docs(corpus::corpus* corpus,
-                       const analyzers::analyzer& analyzer,
+                       const analyzers::analyzer<double>& analyzer,
                        metadata_writer& mdata_writer, uint64_t ram_budget);

     /**
@@ -231,7 +231,7 @@ void forward_index::create_index(const std::string& config_file)
     auto docs = corpus::corpus::load(config_file);
     {
-        auto analyzer = analyzers::load(config);
+        auto analyzer = analyzers::load<double>(config);

         metadata_writer mdata_writer{index_name(), docs->size(),
                                      docs->schema()};
@@ -263,7 +263,7 @@
 }

 void forward_index::impl::tokenize_docs(corpus::corpus* docs,
-                                        const analyzers::analyzer& ana,
+                                        const analyzers::analyzer<double>& ana,
                                         metadata_writer& mdata_writer,
                                         uint64_t ram_budget)
 {
diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp
index 0363b53a8..e4903e242 100644
--- a/src/index/inverted_index.cpp
+++ b/src/index/inverted_index.cpp
@@ -66,7 +66,7 @@
     void load_postings();

     /// The analyzer used to tokenize documents.
-    std::unique_ptr<analyzers::analyzer> analyzer_;
+    std::unique_ptr<analyzers::analyzer<uint64_t>> analyzer_;

     util::optional> postings_;
@@ -76,7 +76,9 @@
 };

 inverted_index::impl::impl(inverted_index* idx, const cpptoml::table& config)
-    : idx_{idx}, analyzer_{analyzers::load(config)}, total_corpus_terms_{0}
+    : idx_{idx},
+      analyzer_{analyzers::load<uint64_t>(config)},
+      total_corpus_terms_{0}
 {
     // nothing
 }
diff --git a/src/parser/analyzers/tree_analyzer.cpp b/src/parser/analyzers/tree_analyzer.cpp
index d57236b5f..66a43f52a 100644
--- a/src/parser/analyzers/tree_analyzer.cpp
+++ b/src/parser/analyzers/tree_analyzer.cpp
@@ -14,11 +14,13 @@ namespace meta
 namespace analyzers
 {

-const std::string tree_analyzer::id = "tree";
+template <class T>
+const std::string tree_analyzer<T>::id = "tree";

-tree_analyzer::tree_analyzer(std::unique_ptr<token_stream> stream,
-                             const std::string& tagger_prefix,
-                             const std::string& parser_prefix)
+template <class T>
+tree_analyzer<T>::tree_analyzer(std::unique_ptr<token_stream> stream,
+                                const std::string& tagger_prefix,
+                                const std::string& parser_prefix)
     : featurizers_{std::make_shared<tree_featurizer_list>()},
       stream_{std::move(stream)},
       tagger_{std::make_shared<sequence::perceptron>(tagger_prefix)},
       parser_{std::make_shared<parser::sr_parser>(parser_prefix)}
@@ -27,7 +29,8 @@
     // nothing
 }

-tree_analyzer::tree_analyzer(const tree_analyzer& other)
+template <class T>
+tree_analyzer<T>::tree_analyzer(const tree_analyzer& other)
     : featurizers_{other.featurizers_},
       stream_{other.stream_->clone()},
       tagger_{other.tagger_},
@@ -36,12 +39,14 @@
     // nothing
 }

-void tree_analyzer::add(std::unique_ptr<const tree_featurizer> featurizer)
+template <class T>
+void tree_analyzer<T>::add(std::unique_ptr<const tree_featurizer> featurizer)
 {
     featurizers_->emplace_back(std::move(featurizer));
 }

-void tree_analyzer::tokenize(corpus::document& doc)
+template <class T>
+void tree_analyzer<T>::tokenize(corpus::document& doc)
 {
     stream_->set_content(get_content(doc));
@@ -68,10 +73,10 @@
     }
 }

-template <>
-std::unique_ptr<tree_analyzer>
-    make_analyzer<tree_analyzer>(const cpptoml::table& global,
-                                 const cpptoml::table& config)
+template <class T>
+std::unique_ptr<tree_analyzer<T>>
+    analyzer_traits<tree_analyzer<T>>::create(const cpptoml::table& global,
+                                              const cpptoml::table& config)
 {
     auto tagger_prefix = config.get_as<std::string>("tagger");
     if (!tagger_prefix)
@@ -87,21 +92,28 @@
             "tree analyzer needs an array of features to generate"};

     auto filts = load_filters(global, config);
-    auto ana = make_unique<tree_analyzer>(std::move(filts), *tagger_prefix,
-                                          *parser_prefix);
+    auto ana = make_unique<tree_analyzer<T>>(std::move(filts), *tagger_prefix,
+                                             *parser_prefix);

     for (const auto& feat : feat_arr->array_of<std::string>())
         ana->add(featurizer_factory::get().create(feat->get()));

     return std::move(ana);
 }
+
+template class tree_analyzer<uint64_t>;
+template class tree_analyzer<double>;
+template struct analyzer_traits<tree_analyzer<uint64_t>>;
+template struct analyzer_traits<tree_analyzer<double>>;
 }

 namespace parser
 {
 void register_analyzers()
 {
-    analyzers::register_analyzer<tree_analyzer>();
+    using namespace analyzers;
+    register_analyzer<tree_analyzer<uint64_t>>();
+    register_analyzer<tree_analyzer<double>>();
 }
 }
 }
diff --git a/src/sequence/analyzers/ngram_pos_analyzer.cpp b/src/sequence/analyzers/ngram_pos_analyzer.cpp
index 9cac3ed2e..53906db37 100644
--- a/src/sequence/analyzers/ngram_pos_analyzer.cpp
+++ b/src/sequence/analyzers/ngram_pos_analyzer.cpp
@@ -15,11 +15,13 @@ namespace meta
 namespace analyzers
 {

-const std::string ngram_pos_analyzer::id = "ngram-pos";
+template <class T>
+const std::string ngram_pos_analyzer<T>::id = "ngram-pos";

-ngram_pos_analyzer::ngram_pos_analyzer(uint16_t n,
-                                       std::unique_ptr<token_stream> stream,
-                                       const std::string& crf_prefix)
+template <class T>
+ngram_pos_analyzer<T>::ngram_pos_analyzer(uint16_t n,
+                                          std::unique_ptr<token_stream> stream,
+                                          const std::string& crf_prefix)
     : base{n},
       stream_{std::move(stream)},
       crf_{std::make_shared<sequence::crf>(crf_prefix)},
@@ -30,9 +32,11 @@
           return ana;
       }()}
 {
+    // nothing
 }

-ngram_pos_analyzer::ngram_pos_analyzer(const ngram_pos_analyzer& other)
+template <class T>
+ngram_pos_analyzer<T>::ngram_pos_analyzer(const ngram_pos_analyzer& other)
     : base{other.n_value()},
       stream_{other.stream_->clone()},
       crf_{other.crf_},
@@ -41,7 +45,8 @@
     // nothing
 }

-void ngram_pos_analyzer::tokenize(corpus::document& doc)
+template <class T>
+void ngram_pos_analyzer<T>::tokenize(corpus::document& doc)
 {
     // first, get tokens
     stream_->set_content(get_content(doc));
@@ -72,10 +77,10 @@
         tagger.tag(seq);

         // create ngrams
-        for (size_t i = n_value() - 1; i < seq.size(); ++i)
+        for (size_t i = this->n_value() - 1; i < seq.size(); ++i)
         {
             std::string combined = seq_analyzer_.tag(seq[i].label());
-            for (size_t j = 1; j < n_value(); ++j)
+            for (size_t j = 1; j < this->n_value(); ++j)
             {
                 std::string next = seq_analyzer_.tag(seq[i - j].label());
                 combined = next + "_" + combined;
@@ -86,10 +91,10 @@
     }
 }

-template <>
-std::unique_ptr<ngram_pos_analyzer>
-    make_analyzer<ngram_pos_analyzer>(const cpptoml::table& global,
-                                      const cpptoml::table& config)
+template <class T>
+std::unique_ptr<ngram_pos_analyzer<T>>
+    analyzer_traits<ngram_pos_analyzer<T>>::create(const cpptoml::table& global,
+                                                   const cpptoml::table& config)
 {
     auto n_val = config.get_as<int64_t>("ngram");
     if (!n_val)
@@ -102,16 +107,23 @@
             "ngram-pos analyzer must contain a prefix to a crf model"};

     auto filts = load_filters(global, config);
-    return make_unique<ngram_pos_analyzer>(*n_val, std::move(filts),
-                                           *crf_prefix);
+    return make_unique<ngram_pos_analyzer<T>>(*n_val, std::move(filts),
+                                              *crf_prefix);
 }
+
+template class ngram_pos_analyzer<uint64_t>;
+template class ngram_pos_analyzer<double>;
+template struct analyzer_traits<ngram_pos_analyzer<uint64_t>>;
+template struct analyzer_traits<ngram_pos_analyzer<double>>;
 }

 namespace sequence
 {
 void register_analyzers()
 {
-    analyzers::register_analyzer<ngram_pos_analyzer>();
+    using namespace analyzers;
+    register_analyzer<ngram_pos_analyzer<uint64_t>>();
+    register_analyzer<ngram_pos_analyzer<double>>();
 }
 }
 }
diff --git a/src/test/analyzer_test.cpp b/src/test/analyzer_test.cpp
index 58e26027f..9dd3b4089 100644
--- a/src/test/analyzer_test.cpp
+++ b/src/test/analyzer_test.cpp
@@ -46,19 +46,19 @@ int content_tokenize()

     num_failed += testing::run_test("content-unigram-word-analyzer", [&]()
     {
-        analyzers::ngram_word_analyzer tok{1, make_filter()};
+        analyzers::ngram_word_analyzer<uint64_t> tok{1, make_filter()};
         check_analyzer_expected(tok, doc, 6, 8);
     });

     num_failed += testing::run_test("content-bigram-word-analyzer", [&]()
     {
-        analyzers::ngram_word_analyzer tok{2, make_filter()};
+        analyzers::ngram_word_analyzer<uint64_t> tok{2, make_filter()};
         check_analyzer_expected(tok, doc, 6, 7);
     });

     num_failed += testing::run_test("content-trigram-word-analyzer", [&]()
     {
-        analyzers::ngram_word_analyzer tok{3, make_filter()};
+        analyzers::ngram_word_analyzer<uint64_t> tok{3, make_filter()};
         check_analyzer_expected(tok, doc, 6, 6);
     });

@@ -73,19 +73,19 @@ int file_tokenize()

     num_failed += testing::run_test("file-unigram-word-analyzer", [&]()
     {
-        analyzers::ngram_word_analyzer tok{1, make_filter()};
+        analyzers::ngram_word_analyzer<uint64_t> tok{1, make_filter()};
         check_analyzer_expected(tok, doc, 93, 168);
     });

     num_failed += testing::run_test("file-bigram-word-analyzer", [&]()
     {
-        analyzers::ngram_word_analyzer tok{2, make_filter()};
+        analyzers::ngram_word_analyzer<uint64_t> tok{2, make_filter()};
         check_analyzer_expected(tok, doc, 140, 167);
     });

     num_failed += testing::run_test("file-trigram-word-analyzer", [&]()
     {
-        analyzers::ngram_word_analyzer tok{3, make_filter()};
+        analyzers::ngram_word_analyzer<uint64_t> tok{3, make_filter()};
         check_analyzer_expected(tok, doc, 159, 166);
     });
diff --git a/src/tools/profile.cpp b/src/tools/profile.cpp
index de5cdc30f..8cf7171a3 100644
--- a/src/tools/profile.cpp
+++ b/src/tools/profile.cpp
@@ -281,7 +281,7 @@ void freq(const std::string& file, const cpptoml::table&, uint16_t n)
     std::unique_ptr<analyzers::token_stream> stream
         = make_unique<analyzers::tokenizers::icu_tokenizer>();
-    analyzers::ngram_word_analyzer ana{n, std::move(stream)};
+    analyzers::ngram_word_analyzer<uint64_t> ana{n, std::move(stream)};

     corpus::document doc;
     doc.content(filesystem::file_text(file));

From 9ce60b1da64ea8083fa29edce9556e44d07482f4 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sat, 5 Sep 2015 20:49:24 -0500
Subject: [PATCH 240/481] Do not store count information in corpus::document.

The analyzers themselves are now responsible for returning the result of
their analysis directly rather than storing it implicitly in the document
object. This actually is a bit nicer, I think, since the document object
is now purely responsible for representing unanalyzed content.

A... couple of files changed.

---
 include/analyzers/analyzer.h                  | 27 +++-
 include/analyzers/multi_analyzer.h            | 6 +-
 include/analyzers/ngram/ngram_word_analyzer.h | 13 +-
 include/corpus/document.h                     | 34 -----
 include/index/inverted_index.h                | 5 +-
 include/index/ranker/ranker.h                 | 117 ++++++++++++++++--
 include/index/score_data.h                    | 8 +-
 .../analyzers/featurizers/branch_featurizer.h | 15 ++-
 .../analyzers/featurizers/depth_featurizer.h  | 15 ++-
 .../featurizers/featurizer_factory.h          | 27 +++-
 .../featurizers/semi_skeleton_featurizer.h    | 15 ++-
 .../featurizers/skeleton_featurizer.h         | 15 ++-
 .../featurizers/subtree_featurizer.h          | 15 ++-
 .../analyzers/featurizers/tag_featurizer.h    | 16 ++-
 .../analyzers/featurizers/tree_featurizer.h   | 13 +-
 include/parser/analyzers/tree_analyzer.h      | 18 +--
 .../sequence/analyzers/ngram_pos_analyzer.h   | 17 +--
 src/analyzers/analyzer.cpp                    | 11 ++
 src/analyzers/multi_analyzer.cpp              | 5 +-
 src/analyzers/ngram/ngram_word_analyzer.cpp   | 5 +-
 src/classify/classifier/knn.cpp               | 23 ++--
 src/corpus/document.cpp                       | 24 +---
 src/index/forward_index.cpp                   | 24 ++--
 src/index/inverted_index.cpp                  | 21 ++--
 src/index/ranker/lm_ranker.cpp                | 2 +-
 src/index/ranker/ranker.cpp                   | 90 ++++----------
 .../featurizers/branch_featurizer.cpp         | 22 ++--
 .../featurizers/depth_featurizer.cpp          | 13 +-
 .../featurizers/featurizer_factory.cpp        | 24 ++--
 .../featurizers/semi_skeleton_featurizer.cpp  | 27 ++--
 .../featurizers/skeleton_featurizer.cpp       | 31 +++--
 .../featurizers/subtree_featurizer.cpp        | 36 +++---
 .../analyzers/featurizers/tag_featurizer.cpp  | 30 +++--
 src/parser/analyzers/tree_analyzer.cpp        | 9 +-
 src/sequence/analyzers/ngram_pos_analyzer.cpp | 5 +-
 src/test/analyzer_test.cpp                    | 90 ++++++++------
 src/tools/profile.cpp                         | 4 +-
 37 files changed, 528 insertions(+), 344 deletions(-)

diff --git a/include/analyzers/analyzer.h b/include/analyzers/analyzer.h
index dec447d3c..e8c93fab0 100644
--- a/include/analyzers/analyzer.h
+++ b/include/analyzers/analyzer.h
@@ -13,8 +13,8 @@

 #include
 #include
-
-#include "io/parser.h"
+#include
+#include

 namespace cpptoml
 {
@@ -34,6 +34,9 @@ namespace analyzers

 class token_stream;

+template <class T>
+class multi_analyzer;
+
 /**
  * A class that provides a framework to produce token counts from documents.
  * All analyzers inherit from this class and (possibly) implement tokenize().
@@ -57,6 +60,8 @@ class analyzer
     using base_type = analyzer;
     using feature_value_type = T;

+    using feature_map = std::unordered_map<std::string, T>;
+
     /**
      * A default virtual destructor.
      */
@@ -64,14 +69,28 @@

     /**
      * Tokenizes a document.
-     * @param doc The document to store the tokenized information in
+     * @param doc The document to be tokenized
+     * @return a feature_map that maps the observed features to their
+     * counts in the document
      */
-    virtual void tokenize(corpus::document& doc) = 0;
+    feature_map analyze(const corpus::document& doc);

     /**
      * Clones this analyzer.
      */
     virtual std::unique_ptr<analyzer> clone() const = 0;
+
+    friend multi_analyzer<T>;
+
+  private:
+    /**
+     * The tokenization function that actually does the heavy lifting. This
+     * should be overridden in derived classes.
+     *
+     * @param doc The document to be tokenized
+     * @param counts The feature_map to place observed feature counts into
+     */
+    virtual void tokenize(const corpus::document& doc, feature_map& counts) = 0;
 };

 /**
diff --git a/include/analyzers/multi_analyzer.h b/include/analyzers/multi_analyzer.h
--- a/include/analyzers/multi_analyzer.h
+++ b/include/analyzers/multi_analyzer.h
@@ -33,6 +33,8 @@ class multi_analyzer : public util::clonable<analyzer<T>, multi_analyzer<T>>
 {
   public:
+    using feature_map = typename analyzer<T>::feature_map;
+
     /**
      * Constructs a multi_analyzer from a vector of other analyzers.
      * @param toks A vector of analyzers to combine features from
      */
@@ -45,11 +47,13 @@
      */
     multi_analyzer(const multi_analyzer& other);

+  private:
     /**
      * Tokenizes a file into a document.
      * @param doc The document to store the tokenized information in
      */
-    virtual void tokenize(corpus::document& doc) override;
+    virtual void tokenize(const corpus::document& doc,
+                          feature_map& counts) override;

   private:
     /// Holds all the analyzers in this multi_analyzer
diff --git a/include/analyzers/ngram/ngram_word_analyzer.h b/include/analyzers/ngram/ngram_word_analyzer.h
--- a/include/analyzers/ngram/ngram_word_analyzer.h
+++ b/include/analyzers/ngram/ngram_word_analyzer.h
@@ -42,6 +42,8 @@
       ngram_word_analyzer>;

   public:
+    using feature_map = typename ngram_word_analyzer::feature_map;
+
     /**
      * Constructor.
      * @param n The value of n to use for the ngrams.
@@ -55,16 +57,17 @@
      */
     ngram_word_analyzer(const ngram_word_analyzer& other);

+    /// Identifier for this analyzer.
+    const static std::string id;
+
+  private:
     /**
      * Tokenizes a file into a document.
      * @param doc The document to store the tokenized information in
      */
-    virtual void tokenize(corpus::document& doc) override;
+    virtual void tokenize(const corpus::document& doc,
+                          feature_map& counts) override;

-    /// Identifier for this analyzer.
-    const static std::string id;
-
-  private:
     /// The token stream to be used for extracting tokens
     std::unique_ptr<token_stream> stream_;
 };
diff --git a/include/corpus/document.h b/include/corpus/document.h
index aebeebfed..bf4563378 100644
--- a/include/corpus/document.h
+++ b/include/corpus/document.h
@@ -41,43 +41,15 @@ class document
     document(doc_id d_id = doc_id{0},
              const class_label& label = class_label{"[NONE]"});

-    /**
-     * Increment the count of the specified transition.
-     * @param term The string token whose count to increment
-     * @param amount The amount to increment by
-     */
-    void increment(const std::string& term, double amount);
-
     /**
      * @return the classification category this document is in
      */
     const class_label& label() const;

-    /**
-     * @return the total of transitions recorded for this document.
-     * This is not the number of unique transitions.
-     */
-    uint64_t length() const;
-
-    /**
-     * Get the number of occurrences for a particular term.
-     * @param term The string term to look up
-     * @return the number of times term appears in this document
-     */
-    double count(const std::string& term) const;
-
-    /**
-     * @return the map of counts for this document.
-     */
-    const std::unordered_map<std::string, double>& counts() const;
-
     /**
      * Sets the content of the document to be the parameter
      * @param content The string content to assign into this document
      * @param encoding the encoding of content, which defaults to utf-8
-     * @note saving the document's content is only used by some corpora
-     * formats; not all documents are guaranteed to have content stored in
-     * the object itself
      */
     void content(const std::string& content,
                  const std::string& encoding = "utf-8");
@@ -135,12 +107,6 @@ class document
     /// Other metadata fields for this document
     std::vector mdata_;

-    /// The number of (non-unique) tokens in this document
-    size_t length_;
-
-    /// Counts of how many times each token appears
-    std::unordered_map<std::string, double> counts_;
-
     /// What the document contains
     util::optional<std::string> content_;
diff --git a/include/index/inverted_index.h b/include/index/inverted_index.h
index d8596fe0f..3ad99c920 100644
--- a/include/index/inverted_index.h
+++ b/include/index/inverted_index.h
@@ -14,6 +14,7 @@
 #include
 #include

+#include "analyzers/analyzer.h"
 #include "index/disk_index.h"
 #include "index/make_index.h"
 #include "index/postings_stream.h"
@@ -120,8 +121,10 @@ class inverted_index : public disk_index

     /**
      * @param doc The document to tokenize
+     * @return the analyzed version of the document
      */
-    void tokenize(corpus::document& doc);
+    analyzers::analyzer<uint64_t>::feature_map
+        tokenize(const corpus::document& doc);

     /**
      * @param t_id The term_id to search for
diff --git a/include/index/ranker/ranker.h b/include/index/ranker/ranker.h
index b895c1936..12249d134 100644
--- a/include/index/ranker/ranker.h
+++ b/include/index/ranker/ranker.h
@@ -13,6 +13,7 @@
 #include

 #include "meta.h"
+#include "index/inverted_index.h"

 namespace meta
 {
@@ -24,7 +25,6 @@ class document;

 namespace index
 {
-class inverted_index;
 struct score_data;
 }
 }

 struct search_result
 {
     search_result(doc_id id, float s) : d_id{id}, score{s}
     {
+        // nothing
     }
     doc_id d_id;
     float score;
 };

+/**
+ * Implementation details for indexing and ranking implementations.
+ */ +namespace detail +{ +struct postings_context +{ + using postings_data_type = inverted_index::postings_data_type; + using iterator = postings_stream::iterator; + + postings_stream stream; + iterator begin; + iterator end; + term_id t_id; + double query_term_weight; + uint64_t doc_count; + uint64_t corpus_term_count; + + postings_context(postings_stream strm, double qtf, term_id term) + : stream{std::move(strm)}, + begin{stream.begin()}, + end{stream.end()}, + t_id{term}, + query_term_weight{qtf}, + doc_count{stream.size()}, + corpus_term_count{stream.total_counts()} + { + // nothing + } +}; + +struct ranker_context +{ + template + ranker_context(inverted_index& inv, ForwardIterator begin, + ForwardIterator end, FilterFunction&& filter) + : idx(inv), cur_doc{idx.num_docs()} + { + postings.reserve(std::distance(begin, end)); + + query_length = 0.0; + for (; begin != end; ++begin) + { + const auto& count = *begin; + query_length += count.second; + auto term = idx.get_term_id(count.first); + auto pstream = idx.stream_for(term); + if (!pstream) + continue; + + postings.emplace_back(*pstream, count.second, term); + + while (postings.back().begin != postings.back().end + && !filter(postings.back().begin->first)) + ++postings.back().begin; + + if (postings.back().begin != postings.back().end) + { + if (postings.back().begin->first < cur_doc) + cur_doc = postings.back().begin->first; + } + } + } + + inverted_index& idx; + std::vector postings; + double query_length; + doc_id cur_doc; +}; +} + /** * A ranker scores a query against all the documents in an inverted index, * returning a list of documents sorted by relevance. @@ -53,20 +125,44 @@ struct search_result class ranker { public: + using filter_function_type = std::function; + /** * @param idx The index this ranker is operating on - * @param query The current query + * @param begin A forward iterator to the beginning of the term + * weights (pairs of std::string and a weight) + * @param end A forward iterator to the end of the above range * @param num_results The number of results to return in the vector * @param filter A filtering function to apply to each doc_id; returns true * if the document should be included in results */ + template std::vector - score(inverted_index& idx, corpus::document& query, - uint64_t num_results = 10, - const std::function& filter = [](doc_id) - { - return true; - }); + score(inverted_index& idx, ForwardIterator begin, ForwardIterator end, + uint64_t num_results = 10, FilterFunction&& filter = [](doc_id) + { + return true; + }) + { + detail::ranker_context ctx{idx, begin, end, filter}; + return rank(ctx, num_results, filter); + } + + /** + * @param idx The index this ranker is operating on + * @param query The current query + * @param num_results The number of results to return in the vector + * @param filter A filtering function to apply to each doc_id; returns + * true if the document should be included in results + */ + std::vector score(inverted_index& idx, + const corpus::document& query, + uint64_t num_results = 10, + const filter_function_type& filter + = [](doc_id) + { + return true; + }); /** * Computes the contribution to the score of a document for a matched @@ -86,6 +182,11 @@ class ranker * Default destructor. 
*/ virtual ~ranker() = default; + + private: + std::vector rank(detail::ranker_context& ctx, + uint64_t num_results, + const filter_function_type& filter); }; } } diff --git a/include/index/score_data.h b/include/index/score_data.h index c7f9ed975..2fe0995b7 100644 --- a/include/index/score_data.h +++ b/include/index/score_data.h @@ -48,8 +48,8 @@ struct score_data uint64_t num_docs; /// total number of terms in the index uint64_t total_terms; - /// the current query - const corpus::document& query; + /// the total length of the query (sum of all term weights) + double query_length; // term-based info @@ -82,12 +82,12 @@ struct score_data * @param p_query The current query */ score_data(inverted_index& p_idx, double p_avg_dl, uint64_t p_num_docs, - uint64_t p_total_terms, const corpus::document& p_query) + uint64_t p_total_terms, double p_length) : idx(p_idx), // gcc no non-const ref init from brace init list avg_dl{p_avg_dl}, num_docs{p_num_docs}, total_terms{p_total_terms}, - query(p_query) // gcc no non-const ref init from brace init list + query_length{p_length} { /* nothing */ } diff --git a/include/parser/analyzers/featurizers/branch_featurizer.h b/include/parser/analyzers/featurizers/branch_featurizer.h index fc27dcd96..75e98b0fe 100644 --- a/include/parser/analyzers/featurizers/branch_featurizer.h +++ b/include/parser/analyzers/featurizers/branch_featurizer.h @@ -21,22 +21,27 @@ namespace analyzers /** * Tokenizes parse trees by extracting branching factor features. */ +template class branch_featurizer - : public util::clonable + : public util::clonable, branch_featurizer> { public: + using feature_map = typename branch_featurizer::feature_map; + /** * Keeps track of the branching factor for this document's parse_trees. - * @param doc The document to parse * @param tree The current parse_tree in the document + * @param counts The feature_map to write to */ - void tree_tokenize(corpus::document& doc, - const parser::parse_tree& tree) const override; + void tree_tokenize(const parser::parse_tree& tree, + feature_map& counts) const override; /// Identifier for this featurizer const static std::string id; }; + +extern template class branch_featurizer; +extern template class branch_featurizer; } } - #endif diff --git a/include/parser/analyzers/featurizers/depth_featurizer.h b/include/parser/analyzers/featurizers/depth_featurizer.h index 0e910716e..a066d58aa 100644 --- a/include/parser/analyzers/featurizers/depth_featurizer.h +++ b/include/parser/analyzers/featurizers/depth_featurizer.h @@ -21,22 +21,27 @@ namespace analyzers /** * Tokenizes parse trees by extracting depth features. */ +template class depth_featurizer - : public util::clonable + : public util::clonable, depth_featurizer> { public: + using feature_map = typename depth_featurizer::feature_map; + /** * Extracts the height of each parse tree. 
- * @param doc The document to parse * @param tree The current parse_tree in the document + * @param counts The feature_map to write to */ - void tree_tokenize(corpus::document& doc, - const parser::parse_tree& tree) const override; + void tree_tokenize(const parser::parse_tree& tree, + feature_map& counts) const override; /// Identifier for this featurizer const static std::string id; }; + +extern template class depth_featurizer; +extern template class depth_featurizer; } } - #endif diff --git a/include/parser/analyzers/featurizers/featurizer_factory.h b/include/parser/analyzers/featurizers/featurizer_factory.h index 828e61dba..41328010a 100644 --- a/include/parser/analyzers/featurizers/featurizer_factory.h +++ b/include/parser/analyzers/featurizers/featurizer_factory.h @@ -29,9 +29,13 @@ namespace analyzers * configuration files. Clients should use the register_featurizer method * instead of this class directly. */ +template class featurizer_factory - : public util::factory + : public util::factory, tree_featurizer> { + using base_factory = typename featurizer_factory::base_factory; + using factory_method = typename featurizer_factory::factory_method; + /// friend the base class friend base_factory; @@ -52,13 +56,27 @@ class featurizer_factory std::unordered_map methods_; }; +/** + * Traits class for featurizers. You should specialize this class if you + * need to customize creation behavior for your featurizer class. This is a + * class template to allow for partial specializations as well. + */ +template +struct featurizer_traits +{ + static std::unique_ptr create() + { + return make_unique(); + } +}; + /** * Factory method for creating a featurizer. */ template -std::unique_ptr make_featurizer() +std::unique_ptr make_featurizer() { - return make_unique(); + return featurizer_traits::create(); } /** @@ -68,7 +86,8 @@ std::unique_ptr make_featurizer() template void register_featurizer() { - featurizer_factory::get().add(Featurizer::id, make_featurizer); + featurizer_factory::get().add( + Featurizer::id, make_featurizer); } } } diff --git a/include/parser/analyzers/featurizers/semi_skeleton_featurizer.h b/include/parser/analyzers/featurizers/semi_skeleton_featurizer.h index fe5f3d322..5a3591a8b 100644 --- a/include/parser/analyzers/featurizers/semi_skeleton_featurizer.h +++ b/include/parser/analyzers/featurizers/semi_skeleton_featurizer.h @@ -22,22 +22,27 @@ namespace analyzers * Tokenizes parse trees by keeping track of only a single node label and * the underlying tree structure. */ +template class semi_skeleton_featurizer - : public util::clonable + : public util::clonable, semi_skeleton_featurizer> { public: + using feature_map = typename semi_skeleton_featurizer::feature_map; + /** * Keeps track of one node's tag and the skeleton structure beneath it. 
- * @param doc The document to parse * @param tree The current parse_tree in the document + * @param counts The feature_map to write to */ - void tree_tokenize(corpus::document& doc, - const parser::parse_tree& tree) const override; + void tree_tokenize(const parser::parse_tree& tree, + feature_map& counts) const override; /// Identifier for this featurizer const static std::string id; }; + +extern template class semi_skeleton_featurizer; +extern template class semi_skeleton_featurizer; } } - #endif diff --git a/include/parser/analyzers/featurizers/skeleton_featurizer.h b/include/parser/analyzers/featurizers/skeleton_featurizer.h index 4d0979fe0..51d2f38f2 100644 --- a/include/parser/analyzers/featurizers/skeleton_featurizer.h +++ b/include/parser/analyzers/featurizers/skeleton_featurizer.h @@ -21,22 +21,27 @@ namespace analyzers /** * Tokenizes parse trees by only tokenizing the tree structure itself. */ +template class skeleton_featurizer - : public util::clonable + : public util::clonable, skeleton_featurizer> { public: + using feature_map = typename skeleton_featurizer::feature_map; + /** * Ignores node labels and only tokenizes the tree structure. - * @param doc The document to parse * @param tree The current parse_tree in the document + * @param counts The feature_map to write to */ - void tree_tokenize(corpus::document& doc, - const parser::parse_tree& tree) const override; + void tree_tokenize(const parser::parse_tree& tree, + feature_map& counts) const override; /// Identifier for this featurizer const static std::string id; }; + +extern template class skeleton_featurizer; +extern template class skeleton_featurizer; } } - #endif diff --git a/include/parser/analyzers/featurizers/subtree_featurizer.h b/include/parser/analyzers/featurizers/subtree_featurizer.h index 6b3b8f301..025853d3d 100644 --- a/include/parser/analyzers/featurizers/subtree_featurizer.h +++ b/include/parser/analyzers/featurizers/subtree_featurizer.h @@ -22,22 +22,27 @@ namespace analyzers * Tokenizes parse trees by counting occurrences of subtrees in a * document's parse tree. */ +template class subtree_featurizer - : public util::clonable + : public util::clonable, subtree_featurizer> { public: + using feature_map = typename subtree_featurizer::feature_map; + /** * Counts occurrences of subtrees in this document's parse_trees. - * @param doc The document to parse * @param tree The current parse_tree in the document + * @param counts The feature_map to write to */ - void tree_tokenize(corpus::document& doc, - const parser::parse_tree& tree) const override; + void tree_tokenize(const parser::parse_tree& tree, + feature_map& counts) const override; /// Identifier for this featurizer const static std::string id; }; + +extern template class subtree_featurizer; +extern template class subtree_featurizer; } } - #endif diff --git a/include/parser/analyzers/featurizers/tag_featurizer.h b/include/parser/analyzers/featurizers/tag_featurizer.h index 05eef64f6..4c1bfe2d4 100644 --- a/include/parser/analyzers/featurizers/tag_featurizer.h +++ b/include/parser/analyzers/featurizers/tag_featurizer.h @@ -21,21 +21,27 @@ namespace analyzers /** * Tokenizes parse trees by looking at labels of leaf and interior nodes. */ -class tag_featurizer : public util::clonable +template +class tag_featurizer + : public util::clonable, tag_featurizer> { public: + using feature_map = typename tag_featurizer::feature_map; + /** * Counts occurrences of leaf and interior node labels. 
- * @param doc The document to parse * @param tree The current parse_tree in the document + * @param counts The feature_map to write to */ - void tree_tokenize(corpus::document& doc, - const parser::parse_tree& tree) const override; + void tree_tokenize(const parser::parse_tree& tree, + feature_map& counts) const override; /// Identifier for this featurizer const static std::string id; }; + +extern template class tag_featurizer; +extern template class tag_featurizer; } } - #endif diff --git a/include/parser/analyzers/featurizers/tree_featurizer.h b/include/parser/analyzers/featurizers/tree_featurizer.h index 9dc4d8a85..e030a91d7 100644 --- a/include/parser/analyzers/featurizers/tree_featurizer.h +++ b/include/parser/analyzers/featurizers/tree_featurizer.h @@ -11,6 +11,7 @@ #include "corpus/document.h" #include "parser/trees/parse_tree.h" +#include "analyzers/analyzer.h" namespace meta { @@ -21,23 +22,27 @@ namespace analyzers * Base class for featurizers that convert trees into features in a * document. */ +template class tree_featurizer { public: + using feature_map = typename analyzer::feature_map; + using feature_value_type = T; + using base_type = tree_featurizer; + /** * Destructor. */ virtual ~tree_featurizer() = default; /** - * @param doc The document to add feature counts to * @param tree The parse tree, belonging to doc, to extract features * from + * @param counts The feature_map to write to */ - virtual void tree_tokenize(corpus::document& doc, - const parser::parse_tree& tree) const = 0; + virtual void tree_tokenize(const parser::parse_tree& tree, + feature_map& counts) const = 0; }; } } - #endif diff --git a/include/parser/analyzers/tree_analyzer.h b/include/parser/analyzers/tree_analyzer.h index 2aaa8d486..1f54212c2 100644 --- a/include/parser/analyzers/tree_analyzer.h +++ b/include/parser/analyzers/tree_analyzer.h @@ -45,6 +45,8 @@ template class tree_analyzer : public util::clonable, tree_analyzer> { public: + using feature_map = typename analyzer::feature_map; + /** * Creates a tree analyzer */ @@ -58,16 +60,10 @@ class tree_analyzer : public util::clonable, tree_analyzer> */ tree_analyzer(const tree_analyzer& other); - /** - * Tokenizes a file into a document. - * @param doc The document to store the tokenized information in - */ - void tokenize(corpus::document& doc) override; - /** * Adds a tree featurizer to the list. */ - void add(std::unique_ptr featurizer); + void add(std::unique_ptr> featurizer); /** * Identifier for this analyzer. @@ -76,7 +72,13 @@ class tree_analyzer : public util::clonable, tree_analyzer> private: using tree_featurizer_list - = std::vector>; + = std::vector>>; + + /** + * Tokenizes a file into a document. + * @param doc The document to store the tokenized information in + */ + void tokenize(const corpus::document& doc, feature_map& counts) override; /** * A list of tree_featurizers to run on each parse tree. diff --git a/include/sequence/analyzers/ngram_pos_analyzer.h b/include/sequence/analyzers/ngram_pos_analyzer.h index 414cca44b..272d08275 100644 --- a/include/sequence/analyzers/ngram_pos_analyzer.h +++ b/include/sequence/analyzers/ngram_pos_analyzer.h @@ -34,8 +34,8 @@ namespace analyzers * method = "ngram-pos" # this analyzer * ngram = 1 # integer required * crf-prefix = "path" - * [[analyzers.filter]] - * type = "icu-tokenizer" # recommended + * filter = [{type = "icu-tokenizer"}, + * {type = "ptb-normalizer"}] # recommended * ~~~ * * Optional config parameters: none. 
@@ -50,6 +50,8 @@ class ngram_pos_analyzer using base = util::multilevel_clonable, ngram_analyzer, ngram_pos_analyzer>; + using feature_map = typename base::feature_map; + public: /** * Constructor. @@ -66,16 +68,17 @@ class ngram_pos_analyzer */ ngram_pos_analyzer(const ngram_pos_analyzer& other); + /// Identifier for this analyzer. + const static std::string id; + + private: /** * Tokenizes a file into a document. * @param doc The document to store the tokenized information in */ - virtual void tokenize(corpus::document& doc) override; + virtual void tokenize(const corpus::document& doc, + feature_map& counts) override; - /// Identifier for this analyzer. - const static std::string id; - - private: /// The token stream to be used for extracting tokens std::unique_ptr stream_; diff --git a/src/analyzers/analyzer.cpp b/src/analyzers/analyzer.cpp index 1e05aec66..a5837eaaf 100644 --- a/src/analyzers/analyzer.cpp +++ b/src/analyzers/analyzer.cpp @@ -24,6 +24,17 @@ namespace meta namespace analyzers { +template +auto analyzer::analyze(const corpus::document& doc) -> feature_map +{ + feature_map counts; + tokenize(doc, counts); + return counts; +} + +template class analyzer; +template class analyzer; + std::string get_content(const corpus::document& doc) { if (!doc.contains_content()) diff --git a/src/analyzers/multi_analyzer.cpp b/src/analyzers/multi_analyzer.cpp index cfd9fdb08..6ebbdd386 100644 --- a/src/analyzers/multi_analyzer.cpp +++ b/src/analyzers/multi_analyzer.cpp @@ -26,10 +26,11 @@ multi_analyzer::multi_analyzer(const multi_analyzer& other) } template -void multi_analyzer::tokenize(corpus::document& doc) +void multi_analyzer::tokenize(const corpus::document& doc, + feature_map& counts) { for (auto& tok : analyzers_) - tok->tokenize(doc); + tok->tokenize(doc, counts); } template class multi_analyzer; diff --git a/src/analyzers/ngram/ngram_word_analyzer.cpp b/src/analyzers/ngram/ngram_word_analyzer.cpp index dd869caef..c87f08868 100644 --- a/src/analyzers/ngram/ngram_word_analyzer.cpp +++ b/src/analyzers/ngram/ngram_word_analyzer.cpp @@ -35,7 +35,8 @@ ngram_word_analyzer::ngram_word_analyzer(const ngram_word_analyzer& other) } template -void ngram_word_analyzer::tokenize(corpus::document& doc) +void ngram_word_analyzer::tokenize(const corpus::document& doc, + feature_map& counts) { stream_->set_content(get_content(doc)); std::deque tokens; @@ -49,7 +50,7 @@ void ngram_word_analyzer::tokenize(corpus::document& doc) for (const auto& token : tokens) combined += "_" + token; - doc.increment(combined, 1); + counts[combined] += 1; } } } diff --git a/src/classify/classifier/knn.cpp b/src/classify/classifier/knn.cpp index d1d707081..a5493b405 100644 --- a/src/classify/classifier/knn.cpp +++ b/src/classify/classifier/knn.cpp @@ -27,7 +27,8 @@ knn::knn(std::shared_ptr idx, k_{k}, ranker_{std::move(ranker)}, weighted_{weighted} -{ /* nothing */ +{ + // nothing } void knn::train(const std::vector& docs) @@ -42,15 +43,19 @@ class_label knn::classify(doc_id d_id) "k must be smaller than the " "number of documents in the index (training documents)"}; - corpus::document query{d_id}; - for (const auto& count : idx_->search_primary(d_id)->counts()) - query.increment(idx_->term_text(count.first), count.second); + analyzers::analyzer::feature_map query; + { + auto pdata = idx_->search_primary(d_id); + query.reserve(pdata->counts().size()); + for (const auto& count : pdata->counts()) + query[idx_->term_text(count.first)] += count.second; + } - auto scored = ranker_->score(*inv_idx_, query, k_, [&](doc_id 
d_id) - { - return legal_docs_.find(d_id) - != legal_docs_.end(); - }); + auto scored = ranker_->score( + *inv_idx_, query.begin(), query.end(), k_, [&](doc_id d_id) + { + return legal_docs_.find(d_id) != legal_docs_.end(); + }); std::unordered_map counts; for (auto& s : scored) diff --git a/src/corpus/document.cpp b/src/corpus/document.cpp index d1ad06326..e982b3d0b 100644 --- a/src/corpus/document.cpp +++ b/src/corpus/document.cpp @@ -13,14 +13,9 @@ namespace corpus { document::document(doc_id d_id, const class_label& label) - : d_id_{d_id}, label_{label}, length_{0}, encoding_{"utf-8"} + : d_id_{d_id}, label_{label}, encoding_{"utf-8"} { -} - -void document::increment(const std::string& term, double amount) -{ - counts_[term] += amount; - length_ += amount; + // nothing } const class_label& document::label() const @@ -28,21 +23,6 @@ const class_label& document::label() const return label_; } -uint64_t document::length() const -{ - return length_; -} - -double document::count(const std::string& term) const -{ - return map::safe_at(counts_, term); -} - -const std::unordered_map& document::counts() const -{ - return counts_; -} - void document::content(const std::string& content, const std::string& encoding /* = "utf-8" */) { diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index fd4b39041..7b4e6a9b5 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -296,10 +296,10 @@ void forward_index::impl::tokenize_docs(corpus::corpus* docs, progress(doc->id()); } - analyzer->tokenize(*doc); + auto counts = analyzer->analyze(*doc); // warn if there is an empty document - if (doc->counts().empty()) + if (counts.empty()) { std::lock_guard lock{io_mutex}; LOG(progress) << '\n' << ENDLG; @@ -307,21 +307,27 @@ void forward_index::impl::tokenize_docs(corpus::corpus* docs, << ") generated!" 
<< ENDLG; } - mdata_writer.write(doc->id(), doc->length(), doc->counts().size(), - doc->mdata()); + auto length = std::accumulate( + counts.begin(), counts.end(), 0ul, + [](uint64_t acc, const std::pair& count) + { + return acc + std::round(count.second); + }); + + mdata_writer.write(doc->id(), length, counts.size(), doc->mdata()); idx_->impl_->set_label(doc->id(), doc->label()); - forward_index::postings_data_type::count_t counts; - counts.reserve(doc->counts().size()); + forward_index::postings_data_type::count_t pd_counts; + pd_counts.reserve(counts.size()); { std::lock_guard lock{vocab_mutex}; - for (const auto& count : doc->counts()) + for (const auto& count : counts) { auto it = vocab.find(count.first); if (it == vocab.end()) it = vocab.insert(count.first); - counts.emplace_back(term_id{it.index()}, count.second); + pd_counts.emplace_back(term_id{it.index()}, count.second); } if (!exceeded_budget && vocab.bytes_used() > ram_budget) @@ -337,7 +343,7 @@ void forward_index::impl::tokenize_docs(corpus::corpus* docs, } forward_index::postings_data_type pdata{doc->id()}; - pdata.set_counts(std::move(counts)); + pdata.set_counts(std::move(pd_counts)); pdata.write_packed(chunk); } }; diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp index e4903e242..011ef890f 100644 --- a/src/index/inverted_index.cpp +++ b/src/index/inverted_index.cpp @@ -194,10 +194,10 @@ void inverted_index::impl::tokenize_docs( progress(doc->id()); } - analyzer->tokenize(*doc); + auto counts = analyzer->analyze(*doc); // warn if there is an empty document - if (doc->counts().empty()) + if (counts.empty()) { std::lock_guard lock{mutex}; LOG(progress) << '\n' << ENDLG; @@ -205,12 +205,18 @@ void inverted_index::impl::tokenize_docs( << ") generated!" << ENDLG; } - mdata_writer.write(doc->id(), doc->length(), doc->counts().size(), - doc->mdata()); + auto length = std::accumulate( + counts.begin(), counts.end(), 0ul, + [](uint64_t acc, const std::pair& count) + { + return acc + count.second; + }); + + mdata_writer.write(doc->id(), length, counts.size(), doc->mdata()); idx_->impl_->set_label(doc->id(), doc->label()); // update chunk - producer(doc->id(), doc->counts()); + producer(doc->id(), counts); } }; @@ -306,9 +312,10 @@ double inverted_index::avg_doc_length() return static_cast(total_corpus_terms()) / num_docs(); } -void inverted_index::tokenize(corpus::document& doc) +analyzers::analyzer::feature_map + inverted_index::tokenize(const corpus::document& doc) { - inv_impl_->analyzer_->tokenize(doc); + return inv_impl_->analyzer_->analyze(doc); } uint64_t inverted_index::doc_freq(term_id t_id) const diff --git a/src/index/ranker/lm_ranker.cpp b/src/index/ranker/lm_ranker.cpp index d4d377888..711d152f1 100644 --- a/src/index/ranker/lm_ranker.cpp +++ b/src/index/ranker/lm_ranker.cpp @@ -26,7 +26,7 @@ float language_model_ranker::score_one(const score_data& sd) float language_model_ranker::initial_score(const score_data& sd) const { - return sd.query.length() * fastapprox::fastlog(doc_constant(sd)); + return sd.query_length * fastapprox::fastlog(doc_constant(sd)); } } } diff --git a/src/index/ranker/ranker.cpp b/src/index/ranker/ranker.cpp index f93f7c540..ea90a5661 100644 --- a/src/index/ranker/ranker.cpp +++ b/src/index/ranker/ranker.cpp @@ -16,45 +16,21 @@ namespace meta namespace index { -namespace +std::vector + ranker::score(inverted_index& idx, const corpus::document& query, + uint64_t num_results /* = 10 */, + const filter_function_type& filter /* return true */) { -struct postings_context -{ - 
using postings_data_type = inverted_index::postings_data_type; - using iterator = postings_stream::iterator; - - postings_stream stream; - iterator begin; - iterator end; - term_id t_id; - double query_term_weight; - uint64_t doc_count; - uint64_t corpus_term_count; - - postings_context(postings_stream strm, double qtf, term_id term) - : stream{std::move(strm)}, - begin{stream.begin()}, - end{stream.end()}, - t_id{term}, - query_term_weight{qtf}, - doc_count{stream.size()}, - corpus_term_count{stream.total_counts()} - { - // nothing - } -}; + auto counts = idx.tokenize(query); + return score(idx, counts.begin(), counts.end(), num_results, filter); } -std::vector ranker::score( - inverted_index& idx, corpus::document& query, - uint64_t num_results /* = 10 */, - const std::function& filter /* return true */) +std::vector ranker::rank(detail::ranker_context& ctx, + uint64_t num_results, + const filter_function_type& filter) { - if (query.counts().empty()) - idx.tokenize(query); - - score_data sd{idx, idx.avg_doc_length(), idx.num_docs(), - idx.total_corpus_terms(), query}; + score_data sd{ctx.idx, ctx.idx.avg_doc_length(), ctx.idx.num_docs(), + ctx.idx.total_corpus_terms(), ctx.query_length}; std::vector results; results.reserve(num_results + 1); // +1 since we use this as a heap and @@ -65,44 +41,20 @@ std::vector ranker::score( return a.score > b.score; }; - std::vector postings; - postings.reserve(query.counts().size()); - - doc_id cur_doc{idx.num_docs()}; - for (const auto& count : query.counts()) - { - auto term = idx.get_term_id(count.first); - auto pstream = idx.stream_for(term); - if (!pstream) - continue; - - postings.emplace_back(*pstream, count.second, term); - - while (postings.back().begin != postings.back().end - && !filter(postings.back().begin->first)) - ++postings.back().begin; - - if (postings.back().begin != postings.back().end) - { - if (postings.back().begin->first < cur_doc) - cur_doc = postings.back().begin->first; - } - } - - doc_id next_doc{idx.num_docs()}; - while (cur_doc < idx.num_docs()) + doc_id next_doc{ctx.idx.num_docs()}; + while (ctx.cur_doc < ctx.idx.num_docs()) { - sd.d_id = cur_doc; - sd.doc_size = idx.doc_size(cur_doc); - sd.doc_unique_terms = idx.unique_terms(cur_doc); + sd.d_id = ctx.cur_doc; + sd.doc_size = ctx.idx.doc_size(ctx.cur_doc); + sd.doc_unique_terms = ctx.idx.unique_terms(ctx.cur_doc); auto score = initial_score(sd); - for (auto& pc : postings) + for (auto& pc : ctx.postings) { if (pc.begin == pc.end) continue; - if (pc.begin->first == cur_doc) + if (pc.begin->first == ctx.cur_doc) { // set up this term sd.t_id = pc.t_id; @@ -131,7 +83,7 @@ std::vector ranker::score( } // add doc to the heap and poll if needed - results.emplace_back(cur_doc, score); + results.emplace_back(ctx.cur_doc, score); std::push_heap(results.begin(), results.end(), comp); if (results.size() > num_results) { @@ -139,8 +91,8 @@ std::vector ranker::score( results.pop_back(); } - cur_doc = next_doc; - next_doc = doc_id{idx.num_docs()}; + ctx.cur_doc = next_doc; + next_doc = doc_id{ctx.idx.num_docs()}; } // heap sort the values diff --git a/src/parser/analyzers/featurizers/branch_featurizer.cpp b/src/parser/analyzers/featurizers/branch_featurizer.cpp index eb1ba8c5d..aa0f7ae6a 100644 --- a/src/parser/analyzers/featurizers/branch_featurizer.cpp +++ b/src/parser/analyzers/featurizers/branch_featurizer.cpp @@ -8,14 +8,18 @@ namespace meta namespace analyzers { -const std::string branch_featurizer::id = "branch"; +template +const std::string branch_featurizer::id = "branch"; 
namespace { +template class branch_visitor : public parser::const_visitor { public: - branch_visitor(corpus::document& d) : doc(d) + using feature_map = typename branch_featurizer::feature_map; + + branch_visitor(feature_map& fm) : counts(fm) { // nothing } @@ -23,7 +27,7 @@ class branch_visitor : public parser::const_visitor void operator()(const parser::internal_node& in) override { auto rep = "branch-" + std::to_string(in.num_children()); - doc.increment(rep, 1); + counts[rep] += 1; in.each_child([&](const parser::node* child) { @@ -37,15 +41,19 @@ class branch_visitor : public parser::const_visitor } private: - corpus::document& doc; + feature_map& counts; }; } -void branch_featurizer::tree_tokenize(corpus::document& doc, - const parser::parse_tree& tree) const +template +void branch_featurizer::tree_tokenize(const parser::parse_tree& tree, + feature_map& counts) const { - branch_visitor vtor{doc}; + branch_visitor vtor{counts}; tree.visit(vtor); } + +template class branch_featurizer; +template class branch_featurizer; } } diff --git a/src/parser/analyzers/featurizers/depth_featurizer.cpp b/src/parser/analyzers/featurizers/depth_featurizer.cpp index 1bbcf7ec5..e1eba3762 100644 --- a/src/parser/analyzers/featurizers/depth_featurizer.cpp +++ b/src/parser/analyzers/featurizers/depth_featurizer.cpp @@ -8,7 +8,8 @@ namespace meta namespace analyzers { -const std::string depth_featurizer::id = "depth"; +template +const std::string depth_featurizer::id = "depth"; namespace { @@ -34,12 +35,16 @@ class height_visitor : public parser::const_visitor }; } -void depth_featurizer::tree_tokenize(corpus::document& doc, - const parser::parse_tree& tree) const +template +void depth_featurizer::tree_tokenize(const parser::parse_tree& tree, + feature_map& counts) const { height_visitor vtor; auto rep = "depth-" + std::to_string(tree.visit(vtor)); - doc.increment(rep, 1); + counts[rep] += 1; } + +template class depth_featurizer; +template class depth_featurizer; } } diff --git a/src/parser/analyzers/featurizers/featurizer_factory.cpp b/src/parser/analyzers/featurizers/featurizer_factory.cpp index 77fbe4ba0..0b495fd75 100644 --- a/src/parser/analyzers/featurizers/featurizer_factory.cpp +++ b/src/parser/analyzers/featurizers/featurizer_factory.cpp @@ -11,21 +11,27 @@ namespace meta namespace analyzers { +template template -void featurizer_factory::register_featurizer() +void featurizer_factory::register_featurizer() { - add(Featurizer::id, make_featurizer); + // this-> needed to find the add() method in dependent base class + this->add(Featurizer::id, make_featurizer); } -featurizer_factory::featurizer_factory() +template +featurizer_factory::featurizer_factory() { // built-in featurizer - register_featurizer(); - register_featurizer(); - register_featurizer(); - register_featurizer(); - register_featurizer(); - register_featurizer(); + register_featurizer>(); + register_featurizer>(); + register_featurizer>(); + register_featurizer>(); + register_featurizer>(); + register_featurizer>(); } + +template class featurizer_factory; +template class featurizer_factory; } } diff --git a/src/parser/analyzers/featurizers/semi_skeleton_featurizer.cpp b/src/parser/analyzers/featurizers/semi_skeleton_featurizer.cpp index 520f8bee9..6a0e8bf2e 100644 --- a/src/parser/analyzers/featurizers/semi_skeleton_featurizer.cpp +++ b/src/parser/analyzers/featurizers/semi_skeleton_featurizer.cpp @@ -8,14 +8,18 @@ namespace meta namespace analyzers { -const std::string semi_skeleton_featurizer::id = "semi-skel"; +template +const 
std::string semi_skeleton_featurizer::id = "semi-skel"; namespace { +template class semi_skeleton_visitor : public parser::const_visitor { public: - semi_skeleton_visitor(corpus::document& d) : doc(d) + using feature_map = typename semi_skeleton_featurizer::feature_map; + + semi_skeleton_visitor(feature_map& fm) : counts(fm) { // nothing } @@ -30,28 +34,31 @@ class semi_skeleton_visitor : public parser::const_visitor }); rep += ")"; - doc.increment(semi_skeleton_featurizer::id + "-" + rep_cat + rep, 1); + counts[semi_skeleton_featurizer::id + "-" + rep_cat + rep] += 1; return "(" + rep; } std::string operator()(const parser::leaf_node& ln) override { - doc.increment(semi_skeleton_featurizer::id + "-(" - + static_cast(ln.category()) + ")", - 1); + counts[semi_skeleton_featurizer::id + "-(" + + static_cast(ln.category()) + ")"] += 1; return "()"; } private: - corpus::document& doc; + feature_map& counts; }; } -void semi_skeleton_featurizer::tree_tokenize( - corpus::document& doc, const parser::parse_tree& tree) const +template +void semi_skeleton_featurizer::tree_tokenize(const parser::parse_tree& tree, + feature_map& counts) const { - semi_skeleton_visitor vtor{doc}; + semi_skeleton_visitor vtor{counts}; tree.visit(vtor); } + +template class semi_skeleton_featurizer; +template class semi_skeleton_featurizer; } } diff --git a/src/parser/analyzers/featurizers/skeleton_featurizer.cpp b/src/parser/analyzers/featurizers/skeleton_featurizer.cpp index d51a01f43..bfd6c03d3 100644 --- a/src/parser/analyzers/featurizers/skeleton_featurizer.cpp +++ b/src/parser/analyzers/featurizers/skeleton_featurizer.cpp @@ -8,14 +8,18 @@ namespace meta namespace analyzers { -const std::string skeleton_featurizer::id = "skel"; +template +const std::string skeleton_featurizer::id = "skel"; namespace { +template class skeleton_visitor : public parser::const_visitor { public: - skeleton_visitor(corpus::document& d) : doc(d) + using feature_map = typename skeleton_featurizer::feature_map; + + skeleton_visitor(feature_map& fm) : counts(fm) { // nothing } @@ -24,31 +28,36 @@ class skeleton_visitor : public parser::const_visitor { std::string rep = "("; in.each_child([&](const parser::node* child) - { - rep += child->accept(*this); - }); + { + rep += child->accept(*this); + }); rep += ")"; - doc.increment(rep, 1); + counts[rep] += 1; return rep; } std::string operator()(const parser::leaf_node&) override { std::string rep = "()"; - doc.increment(rep, 1); + counts[rep] += 1; return rep; } + private: - corpus::document& doc; + feature_map& counts; }; } -void skeleton_featurizer::tree_tokenize(corpus::document& doc, - const parser::parse_tree& tree) const +template +void skeleton_featurizer::tree_tokenize(const parser::parse_tree& tree, + feature_map& counts) const { - skeleton_visitor vtor{doc}; + skeleton_visitor vtor{counts}; tree.visit(vtor); } + +template class skeleton_featurizer; +template class skeleton_featurizer; } } diff --git a/src/parser/analyzers/featurizers/subtree_featurizer.cpp b/src/parser/analyzers/featurizers/subtree_featurizer.cpp index 33f4c034a..e5121a571 100644 --- a/src/parser/analyzers/featurizers/subtree_featurizer.cpp +++ b/src/parser/analyzers/featurizers/subtree_featurizer.cpp @@ -8,14 +8,18 @@ namespace meta namespace analyzers { -const std::string subtree_featurizer::id = "subtree"; +template +const std::string subtree_featurizer::id = "subtree"; namespace { +template class subtree_visitor : public parser::const_visitor { public: - subtree_visitor(corpus::document& d) : doc(d) + using 
feature_map = typename subtree_featurizer::feature_map; + + subtree_visitor(feature_map& fm) : counts(fm) { // nothing } @@ -24,33 +28,37 @@ class subtree_visitor : public parser::const_visitor { auto rep = "(" + static_cast(in.category()); - in.each_child([&](const parser::node* child) - { - rep += " (" + static_cast(child->category()) + ")"; - child->accept(*this); - }); + in.each_child( + [&](const parser::node* child) + { + rep += " (" + static_cast(child->category()) + ")"; + child->accept(*this); + }); rep += ")"; - doc.increment(subtree_featurizer::id + "-" + rep, 1); + counts[subtree_featurizer::id + "-" + rep] += 1; } void operator()(const parser::leaf_node& ln) override { auto rep = "(" + static_cast(ln.category()) + ")"; - doc.increment(subtree_featurizer::id + "-" + rep, 1); + counts[subtree_featurizer::id + "-" + rep] += 1; } - private: - corpus::document& doc; + feature_map& counts; }; } -void subtree_featurizer::tree_tokenize(corpus::document& doc, - const parser::parse_tree& tree) const +template +void subtree_featurizer::tree_tokenize(const parser::parse_tree& tree, + feature_map& counts) const { - subtree_visitor vtor{doc}; + subtree_visitor vtor{counts}; tree.visit(vtor); } + +template class subtree_featurizer; +template class subtree_featurizer; } } diff --git a/src/parser/analyzers/featurizers/tag_featurizer.cpp b/src/parser/analyzers/featurizers/tag_featurizer.cpp index eaeacdb53..fe720a5f0 100644 --- a/src/parser/analyzers/featurizers/tag_featurizer.cpp +++ b/src/parser/analyzers/featurizers/tag_featurizer.cpp @@ -8,23 +8,26 @@ namespace meta namespace analyzers { -const std::string tag_featurizer::id = "tag"; +template +const std::string tag_featurizer::id = "tag"; namespace { +template class tag_visitor : public parser::const_visitor { public: - tag_visitor(corpus::document& d) : doc(d) + using feature_map = typename tag_featurizer::feature_map; + + tag_visitor(feature_map& fm) : counts(fm) { // nothing } void operator()(const parser::internal_node& in) override { - doc.increment(tag_featurizer::id + "-" - + static_cast(in.category()), - 1); + counts[tag_featurizer::id + "-" + + static_cast(in.category())] += 1; in.each_child([&](const parser::node* child) { child->accept(*this); @@ -33,21 +36,24 @@ class tag_visitor : public parser::const_visitor void operator()(const parser::leaf_node& ln) override { - doc.increment(tag_featurizer::id + "-" - + static_cast(ln.category()), - 1); + counts[tag_featurizer::id + "-" + + static_cast(ln.category())] += 1; } private: - corpus::document& doc; + feature_map& counts; }; } -void tag_featurizer::tree_tokenize(corpus::document& doc, - const parser::parse_tree& tree) const +template +void tag_featurizer::tree_tokenize(const parser::parse_tree& tree, + feature_map& counts) const { - tag_visitor vtor{doc}; + tag_visitor vtor{counts}; tree.visit(vtor); } + +template class tag_featurizer; +template class tag_featurizer; } } diff --git a/src/parser/analyzers/tree_analyzer.cpp b/src/parser/analyzers/tree_analyzer.cpp index 66a43f52a..2fea07e7f 100644 --- a/src/parser/analyzers/tree_analyzer.cpp +++ b/src/parser/analyzers/tree_analyzer.cpp @@ -40,13 +40,14 @@ tree_analyzer::tree_analyzer(const tree_analyzer& other) } template -void tree_analyzer::add(std::unique_ptr featurizer) +void tree_analyzer::add(std::unique_ptr> featurizer) { featurizers_->emplace_back(std::move(featurizer)); } template -void tree_analyzer::tokenize(corpus::document& doc) +void tree_analyzer::tokenize(const corpus::document& doc, + feature_map& counts) { 
stream_->set_content(get_content(doc)); @@ -64,7 +65,7 @@ void tree_analyzer::tokenize(corpus::document& doc) tagger_->tag(seq); auto tree = parser_->parse(seq); for (const auto& featurizer : *featurizers_) - featurizer->tree_tokenize(doc, tree); + featurizer->tree_tokenize(tree, counts); } else { @@ -96,7 +97,7 @@ std::unique_ptr> *parser_prefix); for (const auto& feat : feat_arr->array_of()) - ana->add(featurizer_factory::get().create(feat->get())); + ana->add(featurizer_factory::get().create(feat->get())); return std::move(ana); } diff --git a/src/sequence/analyzers/ngram_pos_analyzer.cpp b/src/sequence/analyzers/ngram_pos_analyzer.cpp index 53906db37..9c750611f 100644 --- a/src/sequence/analyzers/ngram_pos_analyzer.cpp +++ b/src/sequence/analyzers/ngram_pos_analyzer.cpp @@ -46,7 +46,8 @@ ngram_pos_analyzer::ngram_pos_analyzer(const ngram_pos_analyzer& other) } template -void ngram_pos_analyzer::tokenize(corpus::document& doc) +void ngram_pos_analyzer::tokenize(const corpus::document& doc, + feature_map& counts) { // first, get tokens stream_->set_content(get_content(doc)); @@ -86,7 +87,7 @@ void ngram_pos_analyzer::tokenize(corpus::document& doc) combined = next + "_" + combined; } - doc.increment(combined, 1); + counts[combined] += 1; } } } diff --git a/src/test/analyzer_test.cpp b/src/test/analyzer_test.cpp index 9dd3b4089..d94f70d7a 100644 --- a/src/test/analyzer_test.cpp +++ b/src/test/analyzer_test.cpp @@ -27,11 +27,19 @@ std::unique_ptr make_filter() template void check_analyzer_expected(Analyzer& ana, corpus::document doc, - uint64_t num_unique, uint64_t length) + uint64_t num_unique, uint64_t length) { - ana.tokenize(doc); - ASSERT_EQUAL(doc.counts().size(), num_unique); - ASSERT_EQUAL(doc.length(), length); + auto counts = ana.analyze(doc); + ASSERT_EQUAL(counts.size(), num_unique); + + auto total = std::accumulate( + counts.begin(), counts.end(), uint64_t{0}, + [](uint64_t acc, + const typename Analyzer::feature_map::value_type& count) + { + return acc + count.second; + }); + ASSERT_EQUAL(total, length); ASSERT_EQUAL(doc.id(), 47ul); } @@ -44,23 +52,26 @@ int content_tokenize() doc.content(content); int num_failed = 0; - num_failed += testing::run_test("content-unigram-word-analyzer", [&]() - { - analyzers::ngram_word_analyzer tok{1, make_filter()}; - check_analyzer_expected(tok, doc, 6, 8); - }); - - num_failed += testing::run_test("content-bigram-word-analyzer", [&]() - { - analyzers::ngram_word_analyzer tok{2, make_filter()}; - check_analyzer_expected(tok, doc, 6, 7); - }); - - num_failed += testing::run_test("content-trigram-word-analyzer", [&]() - { - analyzers::ngram_word_analyzer tok{3, make_filter()}; - check_analyzer_expected(tok, doc, 6, 6); - }); + num_failed += testing::run_test( + "content-unigram-word-analyzer", [&]() + { + analyzers::ngram_word_analyzer tok{1, make_filter()}; + check_analyzer_expected(tok, doc, 6, 8); + }); + + num_failed += testing::run_test( + "content-bigram-word-analyzer", [&]() + { + analyzers::ngram_word_analyzer tok{2, make_filter()}; + check_analyzer_expected(tok, doc, 6, 7); + }); + + num_failed += testing::run_test( + "content-trigram-word-analyzer", [&]() + { + analyzers::ngram_word_analyzer tok{3, make_filter()}; + check_analyzer_expected(tok, doc, 6, 6); + }); return num_failed; } @@ -71,23 +82,26 @@ int file_tokenize() corpus::document doc{doc_id{47}}; doc.content(filesystem::file_text("../data/sample-document.txt")); - num_failed += testing::run_test("file-unigram-word-analyzer", [&]() - { - analyzers::ngram_word_analyzer tok{1, 
make_filter()}; - check_analyzer_expected(tok, doc, 93, 168); - }); - - num_failed += testing::run_test("file-bigram-word-analyzer", [&]() - { - analyzers::ngram_word_analyzer tok{2, make_filter()}; - check_analyzer_expected(tok, doc, 140, 167); - }); - - num_failed += testing::run_test("file-trigram-word-analyzer", [&]() - { - analyzers::ngram_word_analyzer tok{3, make_filter()}; - check_analyzer_expected(tok, doc, 159, 166); - }); + num_failed += testing::run_test( + "file-unigram-word-analyzer", [&]() + { + analyzers::ngram_word_analyzer tok{1, make_filter()}; + check_analyzer_expected(tok, doc, 93, 168); + }); + + num_failed += testing::run_test( + "file-bigram-word-analyzer", [&]() + { + analyzers::ngram_word_analyzer tok{2, make_filter()}; + check_analyzer_expected(tok, doc, 140, 167); + }); + + num_failed += testing::run_test( + "file-trigram-word-analyzer", [&]() + { + analyzers::ngram_word_analyzer tok{3, make_filter()}; + check_analyzer_expected(tok, doc, 159, 166); + }); return num_failed; } diff --git a/src/tools/profile.cpp b/src/tools/profile.cpp index 8cf7171a3..e5017243a 100644 --- a/src/tools/profile.cpp +++ b/src/tools/profile.cpp @@ -285,10 +285,10 @@ void freq(const std::string& file, const cpptoml::table&, uint16_t n) corpus::document doc; doc.content(filesystem::file_text(file)); - ana.tokenize(doc); + auto counts = ana.analyze(doc); using pair_t = std::pair; - std::vector sorted(doc.counts().begin(), doc.counts().end()); + std::vector sorted(counts.begin(), counts.end()); std::sort(sorted.begin(), sorted.end(), [](const pair_t& a, const pair_t& b) { return a.second > b.second; From 6ea9330ac09bc7738db9b48df47c0f815bc99f22 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sat, 5 Sep 2015 21:50:11 -0500 Subject: [PATCH 241/481] Fix default argument for ranker::score(). GCC 4.8.1 would get an ICE on this, and in retrospect I don't really know why the other compilers are happy with this. --- include/index/ranker/ranker.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/index/ranker/ranker.h b/include/index/ranker/ranker.h index 12249d134..12ab857f2 100644 --- a/include/index/ranker/ranker.h +++ b/include/index/ranker/ranker.h @@ -136,7 +136,7 @@ class ranker * @param filter A filtering function to apply to each doc_id; returns true * if the document should be included in results */ - template + template std::vector score(inverted_index& idx, ForwardIterator begin, ForwardIterator end, uint64_t num_results = 10, FilterFunction&& filter = [](doc_id) From 3d803035f07b63ecf9a672a7f578deaaea8a94d1 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sat, 5 Sep 2015 22:15:19 -0500 Subject: [PATCH 242/481] Revert to std::function<> to support GCC 4.8.1. It'd be nice to fix this, but the workaround of ifdefs and another cxx_source_compiles in CMake is just too ugly for me. 
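For what it's worth, call sites shouldn't need to change at all after this
revert, since a lambda still converts implicitly to the std::function
parameter. A rough sketch of the intended usage (rnk, idx, counts, and the
whitelist below are made-up placeholders, not code from this change):

    // hypothetical caller: score the top 10 documents for a query,
    // keeping only whitelisted doc ids (mirrors the filter knn.cpp uses)
    std::unordered_set<doc_id> whitelist{doc_id{1}, doc_id{2}};
    auto results = rnk.score(idx, counts.begin(), counts.end(), 10,
                             [&](doc_id d_id)
                             { return whitelist.find(d_id) != whitelist.end(); });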
--- include/index/ranker/ranker.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/index/ranker/ranker.h b/include/index/ranker/ranker.h
index 12ab857f2..712a2a144 100644
--- a/include/index/ranker/ranker.h
+++ b/include/index/ranker/ranker.h
@@ -136,13 +136,14 @@ class ranker
     * @param filter A filtering function to apply to each doc_id; returns true
     * if the document should be included in results
     */
-    template
+    template
    std::vector score(inverted_index& idx, ForwardIterator begin,
                      ForwardIterator end,
-          uint64_t num_results = 10, FilterFunction&& filter = [](doc_id)
-          {
-              return true;
-          })
+          uint64_t num_results = 10,
+          std::function filter = [](doc_id)
+          {
+              return true;
+          })
    {
        detail::ranker_context ctx{idx, begin, end, filter};
        return rank(ctx, num_results, filter);

From 7aea77f8753411d8e459ae333b5de932e10fdb6b Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sat, 5 Sep 2015 22:27:42 -0500
Subject: [PATCH 243/481] Fix type for filter function (use the typedef).

--- include/index/ranker/ranker.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/index/ranker/ranker.h b/include/index/ranker/ranker.h
index 712a2a144..d975784fc 100644
--- a/include/index/ranker/ranker.h
+++ b/include/index/ranker/ranker.h
@@ -140,7 +140,7 @@ class ranker
    std::vector score(inverted_index& idx, ForwardIterator begin,
                      ForwardIterator end,
                      uint64_t num_results = 10,
-                     std::function filter = [](doc_id)
+                     const filter_function_type& filter = [](doc_id)
                      {
                          return true;
                      })

From 71927676e5da105ec804ad9c4937de44a4054379 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sat, 5 Sep 2015 22:56:07 -0500
Subject: [PATCH 244/481] Add a conditional compilation fix for GCC 4.8.1 ICE.

Nothing else fixes this that I've tried. GCC 4.8.1 just really hates
having a lambda function passed as the default argument to anything that
is a member function template. I don't know why those specific conditions
cause it, but eliminating any of them causes it to not ICE.

...but here we can't really remove any of those restrictions, so the
easiest thing is to just not pass a lambda for that compiler version.
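Boiled down, the trigger seems to need all three ingredients at once; the
sketch below is just the reproducer (the same shape as the configure-time
check this patch adds), not library code:

    struct h
    {
        // a member function template...
        template <class Function>
        // ...whose parameter has a lambda as its default argument...
        void foo(Function&& fun = [](){})
        {
            fun();
        }
    };

    int main()
    {
        h b;
        b.foo(); // instantiating this ICEs GCC 4.8.1; other compilers are fine
        return 0;
    }

Take away the template, the default argument, or the enclosing class and it
compiles fine.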
--- CMakeLists.txt | 21 +++++++++++++++++++++ include/index/ranker/ranker.h | 29 +++++++++++++++++++++-------- 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index baf89bd6a..9c6ee9ab7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -261,6 +261,27 @@ if (META_HAS_NONEMPTY_HASH_SUPPORT) -DMETA_HAS_NONEMPTY_HASH_SUPPORT) endif() +# work around a bug in gcc 4.8.1 where having a lambda as a default +# argument to a member function template causes an internal compiler error +check_cxx_source_compiles(" +struct h { + template + void foo(Function&& fun = [](){}) { + fun(); + } +}; + +int main() { + h b; + b.foo(); + return 0; +}" META_HAS_LAMBDA_AS_DEFAULT_ARGUMENT) + +if (META_HAS_LAMBDA_AS_DEFAULT_ARGUMENT) + target_compile_definitions(meta-definitions INTERFACE + -DMETA_HAS_MEM_FN_TEMPLATE_LAMBDA_DEFAULT_ARGUMENT) +endif() + if(ICU_VERSION VERSION_LESS "4.4") target_compile_definitions(meta-definitions INTERFACE -DMETA_ICU_NO_TEMP_SUBSTRING) diff --git a/include/index/ranker/ranker.h b/include/index/ranker/ranker.h index d975784fc..d80dbb693 100644 --- a/include/index/ranker/ranker.h +++ b/include/index/ranker/ranker.h @@ -127,6 +127,13 @@ class ranker public: using filter_function_type = std::function; +#ifndef META_HAS_MEM_FN_TEMPLATE_LAMBDA_DEFAULT_ARGUMENT + static bool passthrough(doc_id) + { + return true; + } +#endif + /** * @param idx The index this ranker is operating on * @param begin A forward iterator to the beginning of the term @@ -136,14 +143,20 @@ class ranker * @param filter A filtering function to apply to each doc_id; returns true * if the document should be included in results */ - template - std::vector - score(inverted_index& idx, ForwardIterator begin, ForwardIterator end, - uint64_t num_results = 10, - const filter_function_type& filter = [](doc_id) - { - return true; - }) + template + std::vector score(inverted_index& idx, ForwardIterator begin, + ForwardIterator end, + uint64_t num_results = 10, +#if META_HAS_MEM_FN_TEMPLATE_LAMBDA_DEFAULT_ARGUMENT + Function&& filter = + [](doc_id) + { + return true; + } +#else + Function&& filter = passthrough +#endif + ) { detail::ranker_context ctx{idx, begin, end, filter}; return rank(ctx, num_results, filter); From af5ca7b53e1b263e2a0aabb0581f39abb8935bc9 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sun, 6 Sep 2015 17:01:22 -0500 Subject: [PATCH 245/481] Add brief comment above extern template declarations. 
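For context, this is the idiom the new comments annotate (a generic sketch,
not code from this change; analyzer<uint64_t> just stands in for whichever
instantiations a given header actually declares):

    // in the header: promise that an explicit instantiation exists in some
    // translation unit, so including files don't instantiate it themselves
    extern template class analyzer<uint64_t>;

    // in exactly one .cpp file: the matching explicit instantiation
    template class analyzer<uint64_t>;

The one-line comments make it obvious at a glance that the header
declarations only suppress implicit instantiation; they aren't the
instantiations themselves.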
--- include/analyzers/analyzer.h | 1 + include/analyzers/analyzer_factory.h | 1 + include/analyzers/multi_analyzer.h | 1 + include/analyzers/ngram/ngram_analyzer.h | 1 + include/analyzers/ngram/ngram_word_analyzer.h | 3 +++ include/parser/analyzers/featurizers/branch_featurizer.h | 1 + include/parser/analyzers/featurizers/depth_featurizer.h | 1 + .../parser/analyzers/featurizers/semi_skeleton_featurizer.h | 1 + include/parser/analyzers/featurizers/skeleton_featurizer.h | 1 + include/parser/analyzers/featurizers/subtree_featurizer.h | 1 + include/parser/analyzers/featurizers/tag_featurizer.h | 1 + include/parser/analyzers/tree_analyzer.h | 3 +++ include/sequence/analyzers/ngram_pos_analyzer.h | 3 +++ 13 files changed, 19 insertions(+) diff --git a/include/analyzers/analyzer.h b/include/analyzers/analyzer.h index e8c93fab0..f80512523 100644 --- a/include/analyzers/analyzer.h +++ b/include/analyzers/analyzer.h @@ -109,6 +109,7 @@ class analyzer_exception : public std::runtime_error template std::unique_ptr> load(const cpptoml::table& config); +// declare the valid instantiations of the load function above extern template std::unique_ptr> load(const cpptoml::table& config); extern template std::unique_ptr> diff --git a/include/analyzers/analyzer_factory.h b/include/analyzers/analyzer_factory.h index db1e97778..955fe710b 100644 --- a/include/analyzers/analyzer_factory.h +++ b/include/analyzers/analyzer_factory.h @@ -57,6 +57,7 @@ class analyzer_factory std::unordered_map methods_; }; +// declare the valid instantiations for this factory extern template class analyzer_factory; extern template class analyzer_factory; diff --git a/include/analyzers/multi_analyzer.h b/include/analyzers/multi_analyzer.h index 42002b425..d0f64d1e7 100644 --- a/include/analyzers/multi_analyzer.h +++ b/include/analyzers/multi_analyzer.h @@ -60,6 +60,7 @@ class multi_analyzer : public util::clonable, multi_analyzer> std::vector>> analyzers_; }; +// declare the valid instantiations for this analyzer extern template class multi_analyzer; extern template class multi_analyzer; } diff --git a/include/analyzers/ngram/ngram_analyzer.h b/include/analyzers/ngram/ngram_analyzer.h index 484faf584..281b492c1 100644 --- a/include/analyzers/ngram/ngram_analyzer.h +++ b/include/analyzers/ngram/ngram_analyzer.h @@ -52,6 +52,7 @@ class ngram_analyzer : public analyzer uint16_t n_val_; }; +// declare the valid instantiations for this analyzer extern template class ngram_analyzer; extern template class ngram_analyzer; } diff --git a/include/analyzers/ngram/ngram_word_analyzer.h b/include/analyzers/ngram/ngram_word_analyzer.h index bf800b19c..b794639c8 100644 --- a/include/analyzers/ngram/ngram_word_analyzer.h +++ b/include/analyzers/ngram/ngram_word_analyzer.h @@ -83,8 +83,11 @@ struct analyzer_traits> const cpptoml::table&); }; +// declare the valid instantiations for this analyzer extern template class ngram_word_analyzer; extern template class ngram_word_analyzer; + +// declare the valid instantiations for this analyzer's trait class extern template struct analyzer_traits>; extern template struct analyzer_traits>; } diff --git a/include/parser/analyzers/featurizers/branch_featurizer.h b/include/parser/analyzers/featurizers/branch_featurizer.h index 75e98b0fe..7679d5ace 100644 --- a/include/parser/analyzers/featurizers/branch_featurizer.h +++ b/include/parser/analyzers/featurizers/branch_featurizer.h @@ -40,6 +40,7 @@ class branch_featurizer const static std::string id; }; +// declare the valid instantiations for this featurizer extern 
template class branch_featurizer; extern template class branch_featurizer; } diff --git a/include/parser/analyzers/featurizers/depth_featurizer.h b/include/parser/analyzers/featurizers/depth_featurizer.h index a066d58aa..a0920fd5a 100644 --- a/include/parser/analyzers/featurizers/depth_featurizer.h +++ b/include/parser/analyzers/featurizers/depth_featurizer.h @@ -40,6 +40,7 @@ class depth_featurizer const static std::string id; }; +// declare the valid instantiations for this featurizer extern template class depth_featurizer; extern template class depth_featurizer; } diff --git a/include/parser/analyzers/featurizers/semi_skeleton_featurizer.h b/include/parser/analyzers/featurizers/semi_skeleton_featurizer.h index 5a3591a8b..dd3a30e64 100644 --- a/include/parser/analyzers/featurizers/semi_skeleton_featurizer.h +++ b/include/parser/analyzers/featurizers/semi_skeleton_featurizer.h @@ -41,6 +41,7 @@ class semi_skeleton_featurizer const static std::string id; }; +// declare the valid instantiations for this featurizer extern template class semi_skeleton_featurizer; extern template class semi_skeleton_featurizer; } diff --git a/include/parser/analyzers/featurizers/skeleton_featurizer.h b/include/parser/analyzers/featurizers/skeleton_featurizer.h index 51d2f38f2..6a1c93066 100644 --- a/include/parser/analyzers/featurizers/skeleton_featurizer.h +++ b/include/parser/analyzers/featurizers/skeleton_featurizer.h @@ -40,6 +40,7 @@ class skeleton_featurizer const static std::string id; }; +// declare the valid instantiations for this featurizer extern template class skeleton_featurizer; extern template class skeleton_featurizer; } diff --git a/include/parser/analyzers/featurizers/subtree_featurizer.h b/include/parser/analyzers/featurizers/subtree_featurizer.h index 025853d3d..b81195788 100644 --- a/include/parser/analyzers/featurizers/subtree_featurizer.h +++ b/include/parser/analyzers/featurizers/subtree_featurizer.h @@ -41,6 +41,7 @@ class subtree_featurizer const static std::string id; }; +// declare the valid instantiations for this featurizer extern template class subtree_featurizer; extern template class subtree_featurizer; } diff --git a/include/parser/analyzers/featurizers/tag_featurizer.h b/include/parser/analyzers/featurizers/tag_featurizer.h index 4c1bfe2d4..6f14eaf9f 100644 --- a/include/parser/analyzers/featurizers/tag_featurizer.h +++ b/include/parser/analyzers/featurizers/tag_featurizer.h @@ -40,6 +40,7 @@ class tag_featurizer const static std::string id; }; +// declare the valid instantiations for this featurizer extern template class tag_featurizer; extern template class tag_featurizer; } diff --git a/include/parser/analyzers/tree_analyzer.h b/include/parser/analyzers/tree_analyzer.h index 1f54212c2..9051037d1 100644 --- a/include/parser/analyzers/tree_analyzer.h +++ b/include/parser/analyzers/tree_analyzer.h @@ -117,8 +117,11 @@ struct analyzer_traits> const cpptoml::table&); }; +// declare the valid instantiations of this analyzer extern template class tree_analyzer; extern template class tree_analyzer; + +// declare the valid instantiations of this analyzer's traits class extern template struct analyzer_traits>; extern template struct analyzer_traits>; } diff --git a/include/sequence/analyzers/ngram_pos_analyzer.h b/include/sequence/analyzers/ngram_pos_analyzer.h index 272d08275..ee3b3e6b1 100644 --- a/include/sequence/analyzers/ngram_pos_analyzer.h +++ b/include/sequence/analyzers/ngram_pos_analyzer.h @@ -100,8 +100,11 @@ struct analyzer_traits> const cpptoml::table&); }; +// 
declare the valid instantiations for this analyzer
 extern template class ngram_pos_analyzer;
 extern template class ngram_pos_analyzer;
+
+// declare the valid instantiations for this analyzer's trait class
 extern template struct analyzer_traits>;
 extern template struct analyzer_traits>;
 }

From e136c32999685a9c492378b022f3e614861a8948 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sun, 6 Sep 2015 17:33:43 -0500
Subject: [PATCH 246/481] Switch to util::string_view for identifiers for
 factories.

This saves a bunch of heap allocations (one for every class that can be
created via a factory) and allows re-use of the same memory between e.g. the
two instantiations of ngram_word_analyzer, since both now refer to the exact
same static C-string.
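The pattern in isolation, as a minimal sketch rather than code from this
patch (my_filter is an invented name, not a MeTA class): a const static
util::string_view member wraps a string literal in place, so defining it
allocates nothing, whereas a const static std::string member must copy the
literal onto the heap during static initialization.

    // my_filter.h -- sketch only; my_filter is not part of MeTA
    class my_filter
    {
      public:
        /// Identifier for this filter
        const static util::string_view id;
    };

    // my_filter.cpp
    // the view is just a pointer and a length referring to the literal's
    // static storage; nothing is copied and nothing is allocated
    const util::string_view my_filter::id = "my-filter";

Since each id now names characters with static storage duration, a factory
can key its lookup table on these views without owning or copying them,
which is exactly the change the include/util/factory.h hunk below makes.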
---
 include/analyzers/filters/alpha_filter.h | 3 ++-
 include/analyzers/filters/empty_sentence_filter.h | 3 ++-
 include/analyzers/filters/english_normalizer.h | 3 ++-
 include/analyzers/filters/icu_filter.h | 2 +-
 include/analyzers/filters/length_filter.h | 2 +-
 include/analyzers/filters/list_filter.h | 2 +-
 include/analyzers/filters/lowercase_filter.h | 3 ++-
 include/analyzers/filters/porter2_stemmer.h | 3 ++-
 include/analyzers/filters/ptb_normalizer.h | 3 ++-
 include/analyzers/filters/sentence_boundary.h | 2 +-
 include/analyzers/ngram/ngram_word_analyzer.h | 2 +-
 include/analyzers/tokenizers/character_tokenizer.h | 3 ++-
 include/analyzers/tokenizers/icu_tokenizer.h | 2 +-
 include/analyzers/tokenizers/whitespace_tokenizer.h | 3 ++-
 include/classify/classifier/dual_perceptron.h | 2 +-
 include/classify/classifier/knn.h | 2 +-
 include/classify/classifier/logistic_regression.h | 2 +-
 include/classify/classifier/naive_bayes.h | 2 +-
 include/classify/classifier/nearest_centroid.h | 2 +-
 include/classify/classifier/one_vs_all.h | 2 +-
 include/classify/classifier/one_vs_one.h | 2 +-
 include/classify/classifier/sgd.h | 2 +-
 include/classify/classifier/svm_wrapper.h | 2 +-
 include/classify/classifier/winnow.h | 2 +-
 include/classify/loss/hinge.h | 3 ++-
 include/classify/loss/huber.h | 3 ++-
 include/classify/loss/least_squares.h | 3 ++-
 include/classify/loss/logistic.h | 3 ++-
 include/classify/loss/modified_huber.h | 3 ++-
 include/classify/loss/perceptron.h | 3 ++-
 include/classify/loss/smooth_hinge.h | 3 ++-
 include/classify/loss/squared_hinge.h | 3 ++-
 include/index/ranker/absolute_discount.h | 2 +-
 include/index/ranker/dirichlet_prior.h | 2 +-
 include/index/ranker/jelinek_mercer.h | 2 +-
 include/index/ranker/lm_ranker.h | 3 ++-
 include/index/ranker/okapi_bm25.h | 2 +-
 include/index/ranker/pivoted_length.h | 2 +-
 .../parser/analyzers/featurizers/branch_featurizer.h | 3 ++-
 .../parser/analyzers/featurizers/depth_featurizer.h | 3 ++-
 .../parser/analyzers/featurizers/featurizer_factory.h | 3 ---
 .../analyzers/featurizers/semi_skeleton_featurizer.h | 3 ++-
 .../parser/analyzers/featurizers/skeleton_featurizer.h | 3 ++-
 .../parser/analyzers/featurizers/subtree_featurizer.h | 3 ++-
 include/parser/analyzers/featurizers/tag_featurizer.h | 3 ++-
 include/parser/analyzers/tree_analyzer.h | 2 +-
 include/sequence/analyzers/ngram_pos_analyzer.h | 2 +-
 include/util/factory.h | 10 ++++++----
 src/analyzers/filters/alpha_filter.cpp | 2 +-
 src/analyzers/filters/empty_sentence_filter.cpp | 2 +-
 src/analyzers/filters/english_normalizer.cpp | 2 +-
 src/analyzers/filters/icu_filter.cpp | 2 +-
 src/analyzers/filters/length_filter.cpp | 2 +-
 src/analyzers/filters/list_filter.cpp | 2 +-
 src/analyzers/filters/lowercase_filter.cpp | 2 +-
 src/analyzers/filters/porter2_stemmer.cpp | 2 +-
 src/analyzers/filters/ptb_normalizer.cpp | 2 +-
 src/analyzers/filters/sentence_boundary.cpp | 2 +-
 src/analyzers/ngram/ngram_word_analyzer.cpp | 2 +-
 src/analyzers/tokenizers/character_tokenizer.cpp | 2 +-
 src/analyzers/tokenizers/icu_tokenizer.cpp | 2 +-
 src/analyzers/tokenizers/whitespace_tokenizer.cpp | 2 +-
 src/classify/classifier/dual_perceptron.cpp | 2 +-
 src/classify/classifier/knn.cpp | 2 +-
 src/classify/classifier/logistic_regression.cpp | 2 +-
 src/classify/classifier/naive_bayes.cpp | 2 +-
 src/classify/classifier/nearest_centroid.cpp | 2 +-
 src/classify/classifier/one_vs_all.cpp | 2 +-
 src/classify/classifier/one_vs_one.cpp | 2 +-
 src/classify/classifier/sgd.cpp | 2 +-
 src/classify/classifier/svm_wrapper.cpp | 2 +-
 src/classify/classifier/winnow.cpp | 2 +-
 src/classify/loss/hinge.cpp | 2 +-
 src/classify/loss/huber.cpp | 2 +-
 src/classify/loss/least_squares.cpp | 2 +-
 src/classify/loss/logistic.cpp | 2 +-
 src/classify/loss/modified_huber.cpp | 2 +-
 src/classify/loss/perceptron.cpp | 2 +-
 src/classify/loss/smooth_hinge.cpp | 2 +-
 src/classify/loss/squared_hinge.cpp | 2 +-
 src/index/ranker/absolute_discount.cpp | 2 +-
 src/index/ranker/dirichlet_prior.cpp | 2 +-
 src/index/ranker/jelinek_mercer.cpp | 2 +-
 src/index/ranker/lm_ranker.cpp | 2 +-
 src/index/ranker/okapi_bm25.cpp | 2 +-
 src/index/ranker/pivoted_length.cpp | 2 +-
 src/parser/analyzers/featurizers/branch_featurizer.cpp | 2 +-
 src/parser/analyzers/featurizers/depth_featurizer.cpp | 2 +-
 .../analyzers/featurizers/semi_skeleton_featurizer.cpp | 7 ++++---
 .../analyzers/featurizers/skeleton_featurizer.cpp | 2 +-
 .../analyzers/featurizers/subtree_featurizer.cpp | 6 +++---
 src/parser/analyzers/featurizers/tag_featurizer.cpp | 6 +++---
 src/parser/analyzers/tree_analyzer.cpp | 2 +-
 src/sequence/analyzers/ngram_pos_analyzer.cpp | 2 +-
 94 files changed, 128 insertions(+), 105 deletions(-)

diff --git a/include/analyzers/filters/alpha_filter.h b/include/analyzers/filters/alpha_filter.h
index 2931b8464..581e78d1c 100644
--- a/include/analyzers/filters/alpha_filter.h
+++ b/include/analyzers/filters/alpha_filter.h
@@ -12,6 +12,7 @@
 #include "analyzers/token_stream.h"
 #include "util/clonable.h"
 #include "util/optional.h"
+#include "util/string_view.h"
 namespace meta
 {
@@ -59,7 +60,7 @@ class alpha_filter : public util::clonable
     operator bool() const override;
     /// Identifier for this filter
-    const static std::string id;
+    const static util::string_view id;
   private:
     /**
diff --git a/include/analyzers/filters/empty_sentence_filter.h b/include/analyzers/filters/empty_sentence_filter.h
index a219a6877..0e998eb1e 100644
--- a/include/analyzers/filters/empty_sentence_filter.h
+++ b/include/analyzers/filters/empty_sentence_filter.h
@@ -12,6 +12,7 @@
 #include "analyzers/token_stream.h"
 #include "util/clonable.h"
 #include "util/optional.h"
+#include "util/string_view.h"
 namespace meta
 {
@@ -61,7 +62,7 @@ class empty_sentence_filter
     operator bool() const override;
     /// Identifier for this filter
-    const static std::string id;
+    const static util::string_view id;
   private:
     /**
diff --git a/include/analyzers/filters/english_normalizer.h b/include/analyzers/filters/english_normalizer.h
index 3544288e9..ff084446f 100644
--- a/include/analyzers/filters/english_normalizer.h
+++ b/include/analyzers/filters/english_normalizer.h
@@ -14,6 +14,7 @@
 #include "analyzers/token_stream.h"
 #include "util/clonable.h"
 #include "util/optional.h"
+#include "util/string_view.h"
 namespace meta
 {
@@ -65,7 +66,7 @@ class english_normalizer
     operator bool() const override;
     ///
Identifier for this filter - const static std::string id; + const static util::string_view id; private: /** diff --git a/include/analyzers/filters/icu_filter.h b/include/analyzers/filters/icu_filter.h index 03e0a5c53..a53716db2 100644 --- a/include/analyzers/filters/icu_filter.h +++ b/include/analyzers/filters/icu_filter.h @@ -73,7 +73,7 @@ class icu_filter : public util::clonable operator bool() const override; /// Identifier for this filter - const static std::string id; + const static util::string_view id; private: /** diff --git a/include/analyzers/filters/length_filter.h b/include/analyzers/filters/length_filter.h index 4feed011b..6b94f2af4 100644 --- a/include/analyzers/filters/length_filter.h +++ b/include/analyzers/filters/length_filter.h @@ -76,7 +76,7 @@ class length_filter : public util::clonable operator bool() const override; /// Identifier for this filter - const static std::string id; + const static util::string_view id; private: /** diff --git a/include/analyzers/filters/list_filter.h b/include/analyzers/filters/list_filter.h index d03579f54..7793ba684 100644 --- a/include/analyzers/filters/list_filter.h +++ b/include/analyzers/filters/list_filter.h @@ -91,7 +91,7 @@ class list_filter : public util::clonable operator bool() const override; /// Identifier for this filter - const static std::string id; + const static util::string_view id; private: /** diff --git a/include/analyzers/filters/lowercase_filter.h b/include/analyzers/filters/lowercase_filter.h index d00fd1aab..14998a943 100644 --- a/include/analyzers/filters/lowercase_filter.h +++ b/include/analyzers/filters/lowercase_filter.h @@ -12,6 +12,7 @@ #include #include "analyzers/token_stream.h" #include "util/clonable.h" +#include "util/string_view.h" namespace meta { @@ -59,7 +60,7 @@ class lowercase_filter : public util::clonable operator bool() const override; /// Identifier for this filter - const static std::string id; + const static util::string_view id; private: /// The stream to read tokens from. 
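An aside on the extern template declarations that the previous patch
annotated with "declare the valid instantiations" comments, and which
reappear in the featurizer headers edited below. The idiom splits a class
template's instantiation across files; here is a minimal sketch with
invented names, where uint64_t and double stand in for the real template
arguments (the angle-bracketed arguments were stripped from this capture of
the diff):

    #include <cstdint>

    // my_analyzer.h -- sketch of the extern-template idiom
    template <class T>
    class my_analyzer
    {
        // ...
    };

    // promise the compiler that these instantiations exist in some other
    // translation unit, so each includer skips instantiating them itself
    extern template class my_analyzer<uint64_t>;
    extern template class my_analyzer<double>;

    // my_analyzer.cpp -- the one translation unit that actually emits them
    template class my_analyzer<uint64_t>;
    template class my_analyzer<double>;

When a template admits only a small, fixed set of valid instantiations,
this keeps compile times and duplicated object code down.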
diff --git a/include/analyzers/filters/porter2_stemmer.h b/include/analyzers/filters/porter2_stemmer.h index 0c7fced33..c1bf18c69 100644 --- a/include/analyzers/filters/porter2_stemmer.h +++ b/include/analyzers/filters/porter2_stemmer.h @@ -13,6 +13,7 @@ #include "analyzers/token_stream.h" #include "util/clonable.h" #include "util/optional.h" +#include "util/string_view.h" namespace meta { @@ -61,7 +62,7 @@ class porter2_stemmer : public util::clonable operator bool() const override; /// Identifier for this filter - const static std::string id; + const static util::string_view id; private: /** diff --git a/include/analyzers/filters/ptb_normalizer.h b/include/analyzers/filters/ptb_normalizer.h index d51a7f8d8..aaa66d098 100644 --- a/include/analyzers/filters/ptb_normalizer.h +++ b/include/analyzers/filters/ptb_normalizer.h @@ -13,6 +13,7 @@ #include #include "analyzers/token_stream.h" #include "util/clonable.h" +#include "util/string_view.h" namespace meta { @@ -62,7 +63,7 @@ class ptb_normalizer : public util::clonable operator bool() const override; /// Identifier for this filter - const static std::string id; + const static util::string_view id; private: /** diff --git a/include/analyzers/filters/sentence_boundary.h b/include/analyzers/filters/sentence_boundary.h index 1f2aa9a6c..a293b8301 100644 --- a/include/analyzers/filters/sentence_boundary.h +++ b/include/analyzers/filters/sentence_boundary.h @@ -82,7 +82,7 @@ class sentence_boundary : public util::clonable operator bool() const override; /// Identifier for this filter - const static std::string id; + const static util::string_view id; private: /** diff --git a/include/analyzers/ngram/ngram_word_analyzer.h b/include/analyzers/ngram/ngram_word_analyzer.h index b794639c8..e0c905914 100644 --- a/include/analyzers/ngram/ngram_word_analyzer.h +++ b/include/analyzers/ngram/ngram_word_analyzer.h @@ -58,7 +58,7 @@ class ngram_word_analyzer ngram_word_analyzer(const ngram_word_analyzer& other); /// Identifier for this analyzer. - const static std::string id; + const static util::string_view id; private: /** diff --git a/include/analyzers/tokenizers/character_tokenizer.h b/include/analyzers/tokenizers/character_tokenizer.h index 15f0773fe..7c3826778 100644 --- a/include/analyzers/tokenizers/character_tokenizer.h +++ b/include/analyzers/tokenizers/character_tokenizer.h @@ -11,6 +11,7 @@ #include "analyzers/token_stream.h" #include "util/clonable.h" +#include "util/string_view.h" namespace meta { @@ -58,7 +59,7 @@ class character_tokenizer operator bool() const override; /// Identifier for this tokenizer. 
- const static std::string id; + const static util::string_view id; private: /// The buffered string content for this tokenizer diff --git a/include/analyzers/tokenizers/icu_tokenizer.h b/include/analyzers/tokenizers/icu_tokenizer.h index 3706ec8c7..8019fd789 100644 --- a/include/analyzers/tokenizers/icu_tokenizer.h +++ b/include/analyzers/tokenizers/icu_tokenizer.h @@ -108,7 +108,7 @@ class icu_tokenizer : public util::clonable operator bool() const override; /// Identifier for this tokenizer - const static std::string id; + const static util::string_view id; private: /// Forward declaration of the impl diff --git a/include/analyzers/tokenizers/whitespace_tokenizer.h b/include/analyzers/tokenizers/whitespace_tokenizer.h index be7ca7fe3..da1ace8b3 100644 --- a/include/analyzers/tokenizers/whitespace_tokenizer.h +++ b/include/analyzers/tokenizers/whitespace_tokenizer.h @@ -11,6 +11,7 @@ #include "analyzers/token_stream.h" #include "util/clonable.h" +#include "util/string_view.h" namespace meta { @@ -60,7 +61,7 @@ class whitespace_tokenizer : public util::clonable #include "classify/loss/loss_function.h" +#include "util/string_view.h" namespace meta { @@ -33,7 +34,7 @@ struct hinge : public loss_function /** * The identifier for this loss function. */ - const static std::string id; + const static util::string_view id; double loss(double prediction, double expected) const override; double derivative(double prediction, double expected) const override; diff --git a/include/classify/loss/huber.h b/include/classify/loss/huber.h index ae1ccbdf8..747e6bce1 100644 --- a/include/classify/loss/huber.h +++ b/include/classify/loss/huber.h @@ -11,6 +11,7 @@ #include #include "classify/loss/loss_function.h" +#include "util/string_view.h" namespace meta { @@ -30,7 +31,7 @@ struct huber : public loss_function /** * The identifier for this loss function. */ - const static std::string id; + const static util::string_view id; double loss(double prediction, double expected) const override; double derivative(double prediction, double expected) const override; diff --git a/include/classify/loss/least_squares.h b/include/classify/loss/least_squares.h index 159573997..edb7424db 100644 --- a/include/classify/loss/least_squares.h +++ b/include/classify/loss/least_squares.h @@ -10,6 +10,7 @@ #define META_CLASSIFY_LEAST_SQUARES_LOSS_H_ #include "classify/loss/loss_function.h" +#include "util/string_view.h" namespace meta { @@ -28,7 +29,7 @@ struct least_squares : public loss_function /** * The identifier for this loss function. */ - const static std::string id; + const static util::string_view id; double loss(double prediction, double expected) const override; double derivative(double prediction, double expected) const override; diff --git a/include/classify/loss/logistic.h b/include/classify/loss/logistic.h index 416dceb63..3634c0d7f 100644 --- a/include/classify/loss/logistic.h +++ b/include/classify/loss/logistic.h @@ -11,6 +11,7 @@ #include #include "classify/loss/loss_function.h" +#include "util/string_view.h" namespace meta { @@ -29,7 +30,7 @@ struct logistic : public loss_function /** * The identifier for this loss function. 
*/ - const static std::string id; + const static util::string_view id; double loss(double prediction, double expected) const override; double derivative(double prediction, double expected) const override; diff --git a/include/classify/loss/modified_huber.h b/include/classify/loss/modified_huber.h index a8bf026a5..61396582d 100644 --- a/include/classify/loss/modified_huber.h +++ b/include/classify/loss/modified_huber.h @@ -10,6 +10,7 @@ #define META_CLASSIFY_MODIFIED_HUBER_LOSS_H_ #include "classify/loss/loss_function.h" +#include "util/string_view.h" namespace meta { @@ -29,7 +30,7 @@ struct modified_huber : public loss_function /** * The identifier for this loss function. */ - const static std::string id; + const static util::string_view id; double loss(double prediction, double expected) const override; double derivative(double prediction, double expected) const override; diff --git a/include/classify/loss/perceptron.h b/include/classify/loss/perceptron.h index 064a3e8b4..05062dabc 100644 --- a/include/classify/loss/perceptron.h +++ b/include/classify/loss/perceptron.h @@ -10,6 +10,7 @@ #define META_CLASSIFY_PERCEPTRON_LOSS_H_ #include "classify/loss/loss_function.h" +#include "util/string_view.h" namespace meta { @@ -28,7 +29,7 @@ struct perceptron : public loss_function /** * The identifier for this loss function. */ - const static std::string id; + const static util::string_view id; double loss(double prediction, double expected) const override; double derivative(double prediction, double expected) const override; diff --git a/include/classify/loss/smooth_hinge.h b/include/classify/loss/smooth_hinge.h index 25ec35211..d24224707 100644 --- a/include/classify/loss/smooth_hinge.h +++ b/include/classify/loss/smooth_hinge.h @@ -10,6 +10,7 @@ #define META_CLASSIFY_SMOOTH_HINGE_LOSS_H_ #include "classify/loss/loss_function.h" +#include "util/string_view.h" namespace meta { @@ -29,7 +30,7 @@ struct smooth_hinge : public loss_function /** * The identifier for this loss function. */ - const static std::string id; + const static util::string_view id; double loss(double prediction, double expected) const override; double derivative(double prediction, double expected) const override; diff --git a/include/classify/loss/squared_hinge.h b/include/classify/loss/squared_hinge.h index 1897d50d3..a20f21398 100644 --- a/include/classify/loss/squared_hinge.h +++ b/include/classify/loss/squared_hinge.h @@ -10,6 +10,7 @@ #define META_CLASSIFY_SQUARED_HINGE_LOSS_H_ #include "classify/loss/loss_function.h" +#include "util/string_view.h" namespace meta { @@ -28,7 +29,7 @@ struct squared_hinge : public loss_function /** * The identifier for this loss function. */ - const static std::string id; + const static util::string_view id; double loss(double prediction, double expected) const override; double derivative(double prediction, double expected) const override; diff --git a/include/index/ranker/absolute_discount.h b/include/index/ranker/absolute_discount.h index fb768c78e..83d352580 100644 --- a/include/index/ranker/absolute_discount.h +++ b/include/index/ranker/absolute_discount.h @@ -38,7 +38,7 @@ class absolute_discount : public language_model_ranker /** * The identifier of this ranker. 
*/ - const static std::string id; + const static util::string_view id; /** * @param delta diff --git a/include/index/ranker/dirichlet_prior.h b/include/index/ranker/dirichlet_prior.h index 4058435ae..305f693ba 100644 --- a/include/index/ranker/dirichlet_prior.h +++ b/include/index/ranker/dirichlet_prior.h @@ -36,7 +36,7 @@ class dirichlet_prior : public language_model_ranker { public: /// Identifier for this ranker. - const static std::string id; + const static util::string_view id; /// Default value of mu const static constexpr float default_mu = 2000.0f; diff --git a/include/index/ranker/jelinek_mercer.h b/include/index/ranker/jelinek_mercer.h index a0203f51d..bbf0db166 100644 --- a/include/index/ranker/jelinek_mercer.h +++ b/include/index/ranker/jelinek_mercer.h @@ -38,7 +38,7 @@ class jelinek_mercer : public language_model_ranker { public: /// The identifier for this ranker. - const static std::string id; + const static util::string_view id; /// Default value of lambda const static constexpr float default_lambda = 0.7f; diff --git a/include/index/ranker/lm_ranker.h b/include/index/ranker/lm_ranker.h index 71ff86170..b25300b57 100644 --- a/include/index/ranker/lm_ranker.h +++ b/include/index/ranker/lm_ranker.h @@ -10,6 +10,7 @@ #define META_LM_RANKER_H_ #include "index/ranker/ranker.h" +#include "util/string_view.h" namespace meta { @@ -25,7 +26,7 @@ class language_model_ranker : public ranker { public: /// The identifier for this ranker. - const static std::string id; + const static util::string_view id; /** * @param sd diff --git a/include/index/ranker/okapi_bm25.h b/include/index/ranker/okapi_bm25.h index 93b371ef6..58f08ce5f 100644 --- a/include/index/ranker/okapi_bm25.h +++ b/include/index/ranker/okapi_bm25.h @@ -37,7 +37,7 @@ class okapi_bm25 : public ranker { public: /// The identifier for this ranker. - const static std::string id; + const static util::string_view id; /// Default k1, doc term smoothing const static constexpr float default_k1 = 1.2f; diff --git a/include/index/ranker/pivoted_length.h b/include/index/ranker/pivoted_length.h index 67a835d89..d7a3aa0db 100644 --- a/include/index/ranker/pivoted_length.h +++ b/include/index/ranker/pivoted_length.h @@ -37,7 +37,7 @@ class pivoted_length : public ranker { public: /// Identifier for this ranker. 
- const static std::string id; + const static util::string_view id; /// Default value of s parameter const static constexpr float default_s = 0.2f; diff --git a/include/parser/analyzers/featurizers/branch_featurizer.h b/include/parser/analyzers/featurizers/branch_featurizer.h index 7679d5ace..3fbb88607 100644 --- a/include/parser/analyzers/featurizers/branch_featurizer.h +++ b/include/parser/analyzers/featurizers/branch_featurizer.h @@ -12,6 +12,7 @@ #include "parser/analyzers/featurizers/tree_featurizer.h" #include "util/clonable.h" +#include "util/string_view.h" namespace meta { @@ -37,7 +38,7 @@ class branch_featurizer feature_map& counts) const override; /// Identifier for this featurizer - const static std::string id; + const static util::string_view id; }; // declare the valid instantiations for this featurizer diff --git a/include/parser/analyzers/featurizers/depth_featurizer.h b/include/parser/analyzers/featurizers/depth_featurizer.h index a0920fd5a..7bf127789 100644 --- a/include/parser/analyzers/featurizers/depth_featurizer.h +++ b/include/parser/analyzers/featurizers/depth_featurizer.h @@ -12,6 +12,7 @@ #include "parser/analyzers/featurizers/tree_featurizer.h" #include "util/clonable.h" +#include "util/string_view.h" namespace meta { @@ -37,7 +38,7 @@ class depth_featurizer feature_map& counts) const override; /// Identifier for this featurizer - const static std::string id; + const static util::string_view id; }; // declare the valid instantiations for this featurizer diff --git a/include/parser/analyzers/featurizers/featurizer_factory.h b/include/parser/analyzers/featurizers/featurizer_factory.h index 41328010a..a22c321bc 100644 --- a/include/parser/analyzers/featurizers/featurizer_factory.h +++ b/include/parser/analyzers/featurizers/featurizer_factory.h @@ -51,9 +51,6 @@ class featurizer_factory */ template void register_featurizer(); - - /// maps id strings to the factory method used to create that class - std::unordered_map methods_; }; /** diff --git a/include/parser/analyzers/featurizers/semi_skeleton_featurizer.h b/include/parser/analyzers/featurizers/semi_skeleton_featurizer.h index dd3a30e64..34fec6c9a 100644 --- a/include/parser/analyzers/featurizers/semi_skeleton_featurizer.h +++ b/include/parser/analyzers/featurizers/semi_skeleton_featurizer.h @@ -12,6 +12,7 @@ #include "parser/analyzers/featurizers/tree_featurizer.h" #include "util/clonable.h" +#include "util/string_view.h" namespace meta { @@ -38,7 +39,7 @@ class semi_skeleton_featurizer feature_map& counts) const override; /// Identifier for this featurizer - const static std::string id; + const static util::string_view id; }; // declare the valid instantiations for this featurizer diff --git a/include/parser/analyzers/featurizers/skeleton_featurizer.h b/include/parser/analyzers/featurizers/skeleton_featurizer.h index 6a1c93066..aef72c3a6 100644 --- a/include/parser/analyzers/featurizers/skeleton_featurizer.h +++ b/include/parser/analyzers/featurizers/skeleton_featurizer.h @@ -12,6 +12,7 @@ #include "parser/analyzers/tree_analyzer.h" #include "util/clonable.h" +#include "util/string_view.h" namespace meta { @@ -37,7 +38,7 @@ class skeleton_featurizer feature_map& counts) const override; /// Identifier for this featurizer - const static std::string id; + const static util::string_view id; }; // declare the valid instantiations for this featurizer diff --git a/include/parser/analyzers/featurizers/subtree_featurizer.h b/include/parser/analyzers/featurizers/subtree_featurizer.h index b81195788..f2520866d 100644 --- 
a/include/parser/analyzers/featurizers/subtree_featurizer.h +++ b/include/parser/analyzers/featurizers/subtree_featurizer.h @@ -12,6 +12,7 @@ #include "parser/analyzers/featurizers/tree_featurizer.h" #include "util/clonable.h" +#include "util/string_view.h" namespace meta { @@ -38,7 +39,7 @@ class subtree_featurizer feature_map& counts) const override; /// Identifier for this featurizer - const static std::string id; + const static util::string_view id; }; // declare the valid instantiations for this featurizer diff --git a/include/parser/analyzers/featurizers/tag_featurizer.h b/include/parser/analyzers/featurizers/tag_featurizer.h index 6f14eaf9f..f06fe8b67 100644 --- a/include/parser/analyzers/featurizers/tag_featurizer.h +++ b/include/parser/analyzers/featurizers/tag_featurizer.h @@ -12,6 +12,7 @@ #include "parser/analyzers/featurizers/tree_featurizer.h" #include "util/clonable.h" +#include "util/string_view.h" namespace meta { @@ -37,7 +38,7 @@ class tag_featurizer feature_map& counts) const override; /// Identifier for this featurizer - const static std::string id; + const static util::string_view id; }; // declare the valid instantiations for this featurizer diff --git a/include/parser/analyzers/tree_analyzer.h b/include/parser/analyzers/tree_analyzer.h index 9051037d1..5fa9e4fe7 100644 --- a/include/parser/analyzers/tree_analyzer.h +++ b/include/parser/analyzers/tree_analyzer.h @@ -68,7 +68,7 @@ class tree_analyzer : public util::clonable, tree_analyzer> /** * Identifier for this analyzer. */ - const static std::string id; + const static util::string_view id; private: using tree_featurizer_list diff --git a/include/sequence/analyzers/ngram_pos_analyzer.h b/include/sequence/analyzers/ngram_pos_analyzer.h index ee3b3e6b1..81b86147e 100644 --- a/include/sequence/analyzers/ngram_pos_analyzer.h +++ b/include/sequence/analyzers/ngram_pos_analyzer.h @@ -69,7 +69,7 @@ class ngram_pos_analyzer ngram_pos_analyzer(const ngram_pos_analyzer& other); /// Identifier for this analyzer. - const static std::string id; + const static util::string_view id; private: /** diff --git a/include/util/factory.h b/include/util/factory.h index 127e960e3..b6d66853e 100644 --- a/include/util/factory.h +++ b/include/util/factory.h @@ -14,6 +14,7 @@ #include #include #include +#include "util/string_view.h" namespace meta { @@ -59,7 +60,7 @@ class factory * @param fn The factory method */ template - void add(const std::string& identifier, Function&& fn) + void add(util::string_view identifier, Function&& fn) { if (methods_.find(identifier) != methods_.end()) throw exception{"classifier already registered with that id"}; @@ -74,16 +75,17 @@ class factory * @return a unique_ptr to the new object created */ template - pointer create(const std::string& identifier, Args&&... args) + pointer create(util::string_view identifier, Args&&... args) { if (methods_.find(identifier) == methods_.end()) - throw exception{"unrecognized identifier: \"" + identifier + "\""}; + throw exception{"unrecognized identifier: \"" + + identifier.to_string() + "\""}; return methods_[identifier](std::forward(args)...); } private: /// The internal map of identifiers to factory_methods. 
- std::unordered_map methods_; + std::unordered_map methods_; }; } } diff --git a/src/analyzers/filters/alpha_filter.cpp b/src/analyzers/filters/alpha_filter.cpp index bbeb70a3c..b2b3946fd 100644 --- a/src/analyzers/filters/alpha_filter.cpp +++ b/src/analyzers/filters/alpha_filter.cpp @@ -14,7 +14,7 @@ namespace analyzers namespace filters { -const std::string alpha_filter::id = "alpha"; +const util::string_view alpha_filter::id = "alpha"; alpha_filter::alpha_filter(std::unique_ptr source) : source_{std::move(source)} diff --git a/src/analyzers/filters/empty_sentence_filter.cpp b/src/analyzers/filters/empty_sentence_filter.cpp index e93ec6bd9..2d1243131 100644 --- a/src/analyzers/filters/empty_sentence_filter.cpp +++ b/src/analyzers/filters/empty_sentence_filter.cpp @@ -12,7 +12,7 @@ namespace analyzers namespace filters { -const std::string empty_sentence_filter::id = "empty-sentence"; +const util::string_view empty_sentence_filter::id = "empty-sentence"; empty_sentence_filter::empty_sentence_filter( std::unique_ptr source) diff --git a/src/analyzers/filters/english_normalizer.cpp b/src/analyzers/filters/english_normalizer.cpp index 75936b098..91b7356d9 100644 --- a/src/analyzers/filters/english_normalizer.cpp +++ b/src/analyzers/filters/english_normalizer.cpp @@ -14,7 +14,7 @@ namespace analyzers namespace filters { -const std::string english_normalizer::id = "normalize"; +const util::string_view english_normalizer::id = "normalize"; english_normalizer::english_normalizer(std::unique_ptr source) : source_{std::move(source)} diff --git a/src/analyzers/filters/icu_filter.cpp b/src/analyzers/filters/icu_filter.cpp index d51e5abe4..b76beb335 100644 --- a/src/analyzers/filters/icu_filter.cpp +++ b/src/analyzers/filters/icu_filter.cpp @@ -13,7 +13,7 @@ namespace analyzers namespace filters { -const std::string icu_filter::id = "icu"; +const util::string_view icu_filter::id = "icu"; icu_filter::icu_filter(std::unique_ptr source, const std::string& id) diff --git a/src/analyzers/filters/length_filter.cpp b/src/analyzers/filters/length_filter.cpp index fd6ace27e..640a696a1 100644 --- a/src/analyzers/filters/length_filter.cpp +++ b/src/analyzers/filters/length_filter.cpp @@ -14,7 +14,7 @@ namespace analyzers namespace filters { -const std::string length_filter::id = "length"; +const util::string_view length_filter::id = "length"; length_filter::length_filter(std::unique_ptr source, uint64_t min, uint64_t max) diff --git a/src/analyzers/filters/list_filter.cpp b/src/analyzers/filters/list_filter.cpp index 47bfe9b95..5abea85ed 100644 --- a/src/analyzers/filters/list_filter.cpp +++ b/src/analyzers/filters/list_filter.cpp @@ -14,7 +14,7 @@ namespace analyzers namespace filters { -const std::string list_filter::id = "list"; +const util::string_view list_filter::id = "list"; list_filter::list_filter(std::unique_ptr source, const std::string& filename, type method) diff --git a/src/analyzers/filters/lowercase_filter.cpp b/src/analyzers/filters/lowercase_filter.cpp index 441371dec..7feb8dfd9 100644 --- a/src/analyzers/filters/lowercase_filter.cpp +++ b/src/analyzers/filters/lowercase_filter.cpp @@ -15,7 +15,7 @@ namespace analyzers namespace filters { -const std::string lowercase_filter::id = "lowercase"; +const util::string_view lowercase_filter::id = "lowercase"; lowercase_filter::lowercase_filter(std::unique_ptr source) : source_{std::move(source)} diff --git a/src/analyzers/filters/porter2_stemmer.cpp b/src/analyzers/filters/porter2_stemmer.cpp index cf9a28870..1ca2209c0 100644 --- 
a/src/analyzers/filters/porter2_stemmer.cpp +++ b/src/analyzers/filters/porter2_stemmer.cpp @@ -13,7 +13,7 @@ namespace analyzers namespace filters { -const std::string porter2_stemmer::id = "porter2-stemmer"; +const util::string_view porter2_stemmer::id = "porter2-stemmer"; porter2_stemmer::porter2_stemmer(std::unique_ptr source) : source_{std::move(source)} diff --git a/src/analyzers/filters/ptb_normalizer.cpp b/src/analyzers/filters/ptb_normalizer.cpp index 3e79c46b4..77dbf6ece 100644 --- a/src/analyzers/filters/ptb_normalizer.cpp +++ b/src/analyzers/filters/ptb_normalizer.cpp @@ -14,7 +14,7 @@ namespace analyzers namespace filters { -const std::string ptb_normalizer::id = "ptb-normalizer"; +const util::string_view ptb_normalizer::id = "ptb-normalizer"; ptb_normalizer::ptb_normalizer(std::unique_ptr source) : source_{std::move(source)} diff --git a/src/analyzers/filters/sentence_boundary.cpp b/src/analyzers/filters/sentence_boundary.cpp index 894cb37be..d75609e80 100644 --- a/src/analyzers/filters/sentence_boundary.cpp +++ b/src/analyzers/filters/sentence_boundary.cpp @@ -15,7 +15,7 @@ namespace analyzers namespace filters { -const std::string sentence_boundary::id = "sentence-boundary"; +const util::string_view sentence_boundary::id = "sentence-boundary"; // static members std::unordered_set sentence_boundary::punc_set{}; diff --git a/src/analyzers/ngram/ngram_word_analyzer.cpp b/src/analyzers/ngram/ngram_word_analyzer.cpp index c87f08868..075f8289d 100644 --- a/src/analyzers/ngram/ngram_word_analyzer.cpp +++ b/src/analyzers/ngram/ngram_word_analyzer.cpp @@ -17,7 +17,7 @@ namespace analyzers { template -const std::string ngram_word_analyzer::id = "ngram-word"; +const util::string_view ngram_word_analyzer::id = "ngram-word"; template ngram_word_analyzer::ngram_word_analyzer( diff --git a/src/analyzers/tokenizers/character_tokenizer.cpp b/src/analyzers/tokenizers/character_tokenizer.cpp index aed68269d..bd91e5b2d 100644 --- a/src/analyzers/tokenizers/character_tokenizer.cpp +++ b/src/analyzers/tokenizers/character_tokenizer.cpp @@ -14,7 +14,7 @@ namespace analyzers namespace tokenizers { -const std::string character_tokenizer::id = "character-tokenizer"; +const util::string_view character_tokenizer::id = "character-tokenizer"; character_tokenizer::character_tokenizer() : idx_{0} { diff --git a/src/analyzers/tokenizers/icu_tokenizer.cpp b/src/analyzers/tokenizers/icu_tokenizer.cpp index 5c6a580c2..f52e9c9fa 100644 --- a/src/analyzers/tokenizers/icu_tokenizer.cpp +++ b/src/analyzers/tokenizers/icu_tokenizer.cpp @@ -21,7 +21,7 @@ namespace analyzers namespace tokenizers { -const std::string icu_tokenizer::id = "icu-tokenizer"; +const util::string_view icu_tokenizer::id = "icu-tokenizer"; /** * Implementation class for the icu_tokenizer. 
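The include/util/factory.h hunk above is where all of these ids converge:
the factory's lookup table is now keyed by string views rather than by
owning std::strings. Below is a self-contained sketch of that design, using
std::string_view as a stand-in for MeTA's util::string_view and invented
widget names; it illustrates the idea rather than reproducing MeTA's
factory.

    #include <functional>
    #include <memory>
    #include <stdexcept>
    #include <string>
    #include <string_view>
    #include <unordered_map>

    struct widget
    {
        virtual ~widget() = default;
    };

    struct blue_widget : widget
    {
        // static storage duration: a view of this id stays valid for the
        // whole program, so the map below can safely keep one as its key
        static constexpr std::string_view id = "blue";
    };

    using factory_method = std::function<std::unique_ptr<widget>()>;
    std::unordered_map<std::string_view, factory_method> methods;

    std::unique_ptr<widget> create(std::string_view identifier)
    {
        auto it = methods.find(identifier);
        if (it == methods.end())
            // build a std::string only on the error path, mirroring the
            // patch's identifier.to_string() in the exception message
            throw std::runtime_error{"unrecognized identifier: \""
                                     + std::string{identifier} + "\""};
        return it->second();
    }

    int main()
    {
        methods.emplace(blue_widget::id,
                        [] { return std::make_unique<blue_widget>(); });
        auto w = create("blue"); // no std::string temporary for the lookup
    }

The trade is a lifetime obligation instead of an allocation: every key must
outlive the map, which ids in static storage satisfy by construction.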
diff --git a/src/analyzers/tokenizers/whitespace_tokenizer.cpp b/src/analyzers/tokenizers/whitespace_tokenizer.cpp index b7887f722..c5aa72991 100644 --- a/src/analyzers/tokenizers/whitespace_tokenizer.cpp +++ b/src/analyzers/tokenizers/whitespace_tokenizer.cpp @@ -17,7 +17,7 @@ namespace analyzers namespace tokenizers { -const std::string whitespace_tokenizer::id = "whitespace-tokenizer"; +const util::string_view whitespace_tokenizer::id = "whitespace-tokenizer"; whitespace_tokenizer::whitespace_tokenizer() : idx_{0} { diff --git a/src/classify/classifier/dual_perceptron.cpp b/src/classify/classifier/dual_perceptron.cpp index cf156d469..f04f8b28a 100644 --- a/src/classify/classifier/dual_perceptron.cpp +++ b/src/classify/classifier/dual_perceptron.cpp @@ -19,7 +19,7 @@ namespace meta namespace classify { -const std::string dual_perceptron::id = "dual-perceptron"; +const util::string_view dual_perceptron::id = "dual-perceptron"; void dual_perceptron::train(const std::vector& docs) { diff --git a/src/classify/classifier/knn.cpp b/src/classify/classifier/knn.cpp index a5493b405..c5f9fcc3c 100644 --- a/src/classify/classifier/knn.cpp +++ b/src/classify/classifier/knn.cpp @@ -17,7 +17,7 @@ namespace meta namespace classify { -const std::string knn::id = "knn"; +const util::string_view knn::id = "knn"; knn::knn(std::shared_ptr idx, std::shared_ptr f_idx, uint16_t k, diff --git a/src/classify/classifier/logistic_regression.cpp b/src/classify/classifier/logistic_regression.cpp index 9f46003d3..043b97264 100644 --- a/src/classify/classifier/logistic_regression.cpp +++ b/src/classify/classifier/logistic_regression.cpp @@ -14,7 +14,7 @@ namespace meta namespace classify { -const std::string logistic_regression::id = "logistic-regression"; +const util::string_view logistic_regression::id = "logistic-regression"; logistic_regression::logistic_regression( const std::string& prefix, std::shared_ptr idx, diff --git a/src/classify/classifier/naive_bayes.cpp b/src/classify/classifier/naive_bayes.cpp index 73ec1b69e..f9d1337cc 100644 --- a/src/classify/classifier/naive_bayes.cpp +++ b/src/classify/classifier/naive_bayes.cpp @@ -19,7 +19,7 @@ namespace meta namespace classify { -const std::string naive_bayes::id = "naive-bayes"; +const util::string_view naive_bayes::id = "naive-bayes"; naive_bayes::naive_bayes(std::shared_ptr idx, double alpha, double beta) diff --git a/src/classify/classifier/nearest_centroid.cpp b/src/classify/classifier/nearest_centroid.cpp index 9ccc35668..d9245c195 100644 --- a/src/classify/classifier/nearest_centroid.cpp +++ b/src/classify/classifier/nearest_centroid.cpp @@ -16,7 +16,7 @@ namespace meta namespace classify { -const std::string nearest_centroid::id = "nearest-centroid"; +const util::string_view nearest_centroid::id = "nearest-centroid"; nearest_centroid::nearest_centroid(std::shared_ptr idx, std::shared_ptr f_idx) diff --git a/src/classify/classifier/one_vs_all.cpp b/src/classify/classifier/one_vs_all.cpp index bf513e686..bc6b72a46 100644 --- a/src/classify/classifier/one_vs_all.cpp +++ b/src/classify/classifier/one_vs_all.cpp @@ -12,7 +12,7 @@ namespace meta namespace classify { -const std::string one_vs_all::id = "one-vs-all"; +const util::string_view one_vs_all::id = "one-vs-all"; void one_vs_all::train(const std::vector& docs) { diff --git a/src/classify/classifier/one_vs_one.cpp b/src/classify/classifier/one_vs_one.cpp index a40e56c46..6a8535679 100644 --- a/src/classify/classifier/one_vs_one.cpp +++ b/src/classify/classifier/one_vs_one.cpp @@ -14,7 +14,7 @@ 
namespace meta namespace classify { -const std::string one_vs_one::id = "one-vs-one"; +const util::string_view one_vs_one::id = "one-vs-one"; void one_vs_one::train(const std::vector& docs) { diff --git a/src/classify/classifier/sgd.cpp b/src/classify/classifier/sgd.cpp index cf0d90306..3e809f128 100644 --- a/src/classify/classifier/sgd.cpp +++ b/src/classify/classifier/sgd.cpp @@ -15,7 +15,7 @@ namespace meta namespace classify { -const std::string sgd::id = "sgd"; +const util::string_view sgd::id = "sgd"; sgd::sgd(const std::string& prefix, std::shared_ptr idx, class_label positive, class_label negative, diff --git a/src/classify/classifier/svm_wrapper.cpp b/src/classify/classifier/svm_wrapper.cpp index 8ed7bd8b6..d29ed51a8 100644 --- a/src/classify/classifier/svm_wrapper.cpp +++ b/src/classify/classifier/svm_wrapper.cpp @@ -12,7 +12,7 @@ namespace meta namespace classify { -const std::string svm_wrapper::id = "libsvm"; +const util::string_view svm_wrapper::id = "libsvm"; decltype(svm_wrapper::options_) svm_wrapper::options_ = {{svm_wrapper::kernel::None, ""}, diff --git a/src/classify/classifier/winnow.cpp b/src/classify/classifier/winnow.cpp index cec358106..e45d32b06 100644 --- a/src/classify/classifier/winnow.cpp +++ b/src/classify/classifier/winnow.cpp @@ -15,7 +15,7 @@ namespace meta namespace classify { -const std::string winnow::id = "winnow"; +const util::string_view winnow::id = "winnow"; winnow::winnow(std::shared_ptr idx, double m, double gamma, size_t max_iter) diff --git a/src/classify/loss/hinge.cpp b/src/classify/loss/hinge.cpp index ec0d6b3f2..c7f204ad9 100644 --- a/src/classify/loss/hinge.cpp +++ b/src/classify/loss/hinge.cpp @@ -12,7 +12,7 @@ namespace classify namespace loss { -const std::string hinge::id = "hinge"; +const util::string_view hinge::id = "hinge"; double hinge::loss(double prediction, double expected) const { diff --git a/src/classify/loss/huber.cpp b/src/classify/loss/huber.cpp index 8a8ab14c1..6b6d60ba8 100644 --- a/src/classify/loss/huber.cpp +++ b/src/classify/loss/huber.cpp @@ -12,7 +12,7 @@ namespace classify namespace loss { -const std::string huber::id = "huber"; +const util::string_view huber::id = "huber"; double huber::loss(double prediction, double expected) const { diff --git a/src/classify/loss/least_squares.cpp b/src/classify/loss/least_squares.cpp index 53d0280ba..21e770b39 100644 --- a/src/classify/loss/least_squares.cpp +++ b/src/classify/loss/least_squares.cpp @@ -12,7 +12,7 @@ namespace classify namespace loss { -const std::string least_squares::id = "least-squares"; +const util::string_view least_squares::id = "least-squares"; double least_squares::loss(double prediction, double expected) const { diff --git a/src/classify/loss/logistic.cpp b/src/classify/loss/logistic.cpp index 438c9781f..5d49ddbf4 100644 --- a/src/classify/loss/logistic.cpp +++ b/src/classify/loss/logistic.cpp @@ -12,7 +12,7 @@ namespace classify namespace loss { -const std::string logistic::id = "logistic"; +const util::string_view logistic::id = "logistic"; double logistic::loss(double prediction, double expected) const { diff --git a/src/classify/loss/modified_huber.cpp b/src/classify/loss/modified_huber.cpp index 0901fad3b..76d29f9b6 100644 --- a/src/classify/loss/modified_huber.cpp +++ b/src/classify/loss/modified_huber.cpp @@ -12,7 +12,7 @@ namespace classify namespace loss { -const std::string modified_huber::id = "modified-huber"; +const util::string_view modified_huber::id = "modified-huber"; double modified_huber::loss(double prediction, double expected) 
const { diff --git a/src/classify/loss/perceptron.cpp b/src/classify/loss/perceptron.cpp index 9b484f02e..252a9d746 100644 --- a/src/classify/loss/perceptron.cpp +++ b/src/classify/loss/perceptron.cpp @@ -12,7 +12,7 @@ namespace classify namespace loss { -const std::string perceptron::id = "perceptron"; +const util::string_view perceptron::id = "perceptron"; double perceptron::loss(double prediction, double expected) const { diff --git a/src/classify/loss/smooth_hinge.cpp b/src/classify/loss/smooth_hinge.cpp index c1a728d2a..67d1e530a 100644 --- a/src/classify/loss/smooth_hinge.cpp +++ b/src/classify/loss/smooth_hinge.cpp @@ -12,7 +12,7 @@ namespace classify namespace loss { -const std::string smooth_hinge::id = "smooth-hinge"; +const util::string_view smooth_hinge::id = "smooth-hinge"; double smooth_hinge::loss(double prediction, double expected) const { diff --git a/src/classify/loss/squared_hinge.cpp b/src/classify/loss/squared_hinge.cpp index e4a4bf99a..59bc8e718 100644 --- a/src/classify/loss/squared_hinge.cpp +++ b/src/classify/loss/squared_hinge.cpp @@ -12,7 +12,7 @@ namespace classify namespace loss { -const std::string squared_hinge::id = "squared-hinge"; +const util::string_view squared_hinge::id = "squared-hinge"; double squared_hinge::loss(double prediction, double expected) const { diff --git a/src/index/ranker/absolute_discount.cpp b/src/index/ranker/absolute_discount.cpp index 15144603b..73f38f468 100644 --- a/src/index/ranker/absolute_discount.cpp +++ b/src/index/ranker/absolute_discount.cpp @@ -13,7 +13,7 @@ namespace meta namespace index { -const std::string absolute_discount::id = "absolute-discount"; +const util::string_view absolute_discount::id = "absolute-discount"; absolute_discount::absolute_discount(float delta) : delta_{delta} { diff --git a/src/index/ranker/dirichlet_prior.cpp b/src/index/ranker/dirichlet_prior.cpp index 360808174..726a49ed8 100644 --- a/src/index/ranker/dirichlet_prior.cpp +++ b/src/index/ranker/dirichlet_prior.cpp @@ -12,7 +12,7 @@ namespace meta namespace index { -const std::string dirichlet_prior::id = "dirichlet-prior"; +const util::string_view dirichlet_prior::id = "dirichlet-prior"; dirichlet_prior::dirichlet_prior(float mu) : mu_{mu} { diff --git a/src/index/ranker/jelinek_mercer.cpp b/src/index/ranker/jelinek_mercer.cpp index 6d34c2568..907241e50 100644 --- a/src/index/ranker/jelinek_mercer.cpp +++ b/src/index/ranker/jelinek_mercer.cpp @@ -12,7 +12,7 @@ namespace meta namespace index { -const std::string jelinek_mercer::id = "jelinek-mercer"; +const util::string_view jelinek_mercer::id = "jelinek-mercer"; jelinek_mercer::jelinek_mercer(float lambda) : lambda_{lambda} { diff --git a/src/index/ranker/lm_ranker.cpp b/src/index/ranker/lm_ranker.cpp index 711d152f1..c83a0e794 100644 --- a/src/index/ranker/lm_ranker.cpp +++ b/src/index/ranker/lm_ranker.cpp @@ -14,7 +14,7 @@ namespace meta namespace index { -const std::string language_model_ranker::id = "language-model"; +const util::string_view language_model_ranker::id = "language-model"; float language_model_ranker::score_one(const score_data& sd) { diff --git a/src/index/ranker/okapi_bm25.cpp b/src/index/ranker/okapi_bm25.cpp index 5c2d42452..7969266a5 100644 --- a/src/index/ranker/okapi_bm25.cpp +++ b/src/index/ranker/okapi_bm25.cpp @@ -14,7 +14,7 @@ namespace meta namespace index { -const std::string okapi_bm25::id = "bm25"; +const util::string_view okapi_bm25::id = "bm25"; okapi_bm25::okapi_bm25(float k1, float b, float k3) : k1_{k1}, b_{b}, k3_{k3} { diff --git 
a/src/index/ranker/pivoted_length.cpp b/src/index/ranker/pivoted_length.cpp index b81c6faa0..589f0c9e1 100644 --- a/src/index/ranker/pivoted_length.cpp +++ b/src/index/ranker/pivoted_length.cpp @@ -13,7 +13,7 @@ namespace meta namespace index { -const std::string pivoted_length::id = "pivoted-length"; +const util::string_view pivoted_length::id = "pivoted-length"; pivoted_length::pivoted_length(float s) : s_{s} { diff --git a/src/parser/analyzers/featurizers/branch_featurizer.cpp b/src/parser/analyzers/featurizers/branch_featurizer.cpp index aa0f7ae6a..55bb33dee 100644 --- a/src/parser/analyzers/featurizers/branch_featurizer.cpp +++ b/src/parser/analyzers/featurizers/branch_featurizer.cpp @@ -9,7 +9,7 @@ namespace analyzers { template -const std::string branch_featurizer::id = "branch"; +const util::string_view branch_featurizer::id = "branch"; namespace { diff --git a/src/parser/analyzers/featurizers/depth_featurizer.cpp b/src/parser/analyzers/featurizers/depth_featurizer.cpp index e1eba3762..35d026641 100644 --- a/src/parser/analyzers/featurizers/depth_featurizer.cpp +++ b/src/parser/analyzers/featurizers/depth_featurizer.cpp @@ -9,7 +9,7 @@ namespace analyzers { template -const std::string depth_featurizer::id = "depth"; +const util::string_view depth_featurizer::id = "depth"; namespace { diff --git a/src/parser/analyzers/featurizers/semi_skeleton_featurizer.cpp b/src/parser/analyzers/featurizers/semi_skeleton_featurizer.cpp index 6a0e8bf2e..5206c4196 100644 --- a/src/parser/analyzers/featurizers/semi_skeleton_featurizer.cpp +++ b/src/parser/analyzers/featurizers/semi_skeleton_featurizer.cpp @@ -9,7 +9,7 @@ namespace analyzers { template -const std::string semi_skeleton_featurizer::id = "semi-skel"; +const util::string_view semi_skeleton_featurizer::id = "semi-skel"; namespace { @@ -34,13 +34,14 @@ class semi_skeleton_visitor : public parser::const_visitor }); rep += ")"; - counts[semi_skeleton_featurizer::id + "-" + rep_cat + rep] += 1; + counts[semi_skeleton_featurizer::id.to_string() + "-" + rep_cat + + rep] += 1; return "(" + rep; } std::string operator()(const parser::leaf_node& ln) override { - counts[semi_skeleton_featurizer::id + "-(" + counts[semi_skeleton_featurizer::id.to_string() + "-(" + static_cast(ln.category()) + ")"] += 1; return "()"; } diff --git a/src/parser/analyzers/featurizers/skeleton_featurizer.cpp b/src/parser/analyzers/featurizers/skeleton_featurizer.cpp index bfd6c03d3..52176c2e4 100644 --- a/src/parser/analyzers/featurizers/skeleton_featurizer.cpp +++ b/src/parser/analyzers/featurizers/skeleton_featurizer.cpp @@ -9,7 +9,7 @@ namespace analyzers { template -const std::string skeleton_featurizer::id = "skel"; +const util::string_view skeleton_featurizer::id = "skel"; namespace { diff --git a/src/parser/analyzers/featurizers/subtree_featurizer.cpp b/src/parser/analyzers/featurizers/subtree_featurizer.cpp index e5121a571..6db769bfc 100644 --- a/src/parser/analyzers/featurizers/subtree_featurizer.cpp +++ b/src/parser/analyzers/featurizers/subtree_featurizer.cpp @@ -9,7 +9,7 @@ namespace analyzers { template -const std::string subtree_featurizer::id = "subtree"; +const util::string_view subtree_featurizer::id = "subtree"; namespace { @@ -36,13 +36,13 @@ class subtree_visitor : public parser::const_visitor }); rep += ")"; - counts[subtree_featurizer::id + "-" + rep] += 1; + counts[subtree_featurizer::id.to_string() + "-" + rep] += 1; } void operator()(const parser::leaf_node& ln) override { auto rep = "(" + static_cast(ln.category()) + ")"; - 
counts[subtree_featurizer::id + "-" + rep] += 1; + counts[subtree_featurizer::id.to_string() + "-" + rep] += 1; } private: diff --git a/src/parser/analyzers/featurizers/tag_featurizer.cpp b/src/parser/analyzers/featurizers/tag_featurizer.cpp index fe720a5f0..0c6a9aa9e 100644 --- a/src/parser/analyzers/featurizers/tag_featurizer.cpp +++ b/src/parser/analyzers/featurizers/tag_featurizer.cpp @@ -9,7 +9,7 @@ namespace analyzers { template -const std::string tag_featurizer::id = "tag"; +const util::string_view tag_featurizer::id = "tag"; namespace { @@ -26,7 +26,7 @@ class tag_visitor : public parser::const_visitor void operator()(const parser::internal_node& in) override { - counts[tag_featurizer::id + "-" + counts[tag_featurizer::id.to_string() + "-" + static_cast(in.category())] += 1; in.each_child([&](const parser::node* child) { @@ -36,7 +36,7 @@ class tag_visitor : public parser::const_visitor void operator()(const parser::leaf_node& ln) override { - counts[tag_featurizer::id + "-" + counts[tag_featurizer::id.to_string() + "-" + static_cast(ln.category())] += 1; } diff --git a/src/parser/analyzers/tree_analyzer.cpp b/src/parser/analyzers/tree_analyzer.cpp index 2fea07e7f..c6d9f503d 100644 --- a/src/parser/analyzers/tree_analyzer.cpp +++ b/src/parser/analyzers/tree_analyzer.cpp @@ -15,7 +15,7 @@ namespace analyzers { template -const std::string tree_analyzer::id = "tree"; +const util::string_view tree_analyzer::id = "tree"; template tree_analyzer::tree_analyzer(std::unique_ptr stream, diff --git a/src/sequence/analyzers/ngram_pos_analyzer.cpp b/src/sequence/analyzers/ngram_pos_analyzer.cpp index 9c750611f..4e290f638 100644 --- a/src/sequence/analyzers/ngram_pos_analyzer.cpp +++ b/src/sequence/analyzers/ngram_pos_analyzer.cpp @@ -16,7 +16,7 @@ namespace analyzers { template -const std::string ngram_pos_analyzer::id = "ngram-pos"; +const util::string_view ngram_pos_analyzer::id = "ngram-pos"; template ngram_pos_analyzer::ngram_pos_analyzer(uint16_t n, From 221348311fb5afe2dc911fa4662ec16bb8e86ea8 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sun, 6 Sep 2015 17:37:33 -0500 Subject: [PATCH 247/481] Bump cpptoml version. --- deps/cpptoml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/cpptoml b/deps/cpptoml index 2dba80d3f..70b977247 160000 --- a/deps/cpptoml +++ b/deps/cpptoml @@ -1 +1 @@ -Subproject commit 2dba80d3f1e87a3d4a5e01fb13494d2553e30df8 +Subproject commit 70b977247d97a5d0766c1d3f91a0ad0427970cf9 From 1e586f2234916a5d929555b2c54c88f9f0cc83f8 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sun, 6 Sep 2015 17:53:36 -0500 Subject: [PATCH 248/481] Use cpptoml::option::value_or to simplify code. 
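The shape of this cleanup, restated as a short before/after sketch taken
from the forward_index.cpp hunk below. The angle-bracketed template
arguments were stripped from this capture of the diff; int64_t matches the
surrounding casts but should be read as an assumption:

    // before: declare a default, then overwrite it if the option is present
    uint64_t ram_budget = 1024;
    if (auto cfg_ram_budget = config.get_as<int64_t>("indexer-ram-budget"))
        ram_budget = static_cast<uint64_t>(*cfg_ram_budget);

    // after: cpptoml::option::value_or folds both steps into one expression
    auto ram_budget = static_cast<uint64_t>(
        config.get_as<int64_t>("indexer-ram-budget").value_or(1024));

One hunk does slightly more than condense: classify.cpp previously compared
a string option against "true" and now reads even-split as a boolean with
value_or(false), so a config file storing the value as the string "true"
would need to switch to a TOML boolean.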
--- src/analyzers/tokenizers/icu_tokenizer.cpp | 5 +- src/classify/classifier/dual_perceptron.cpp | 20 ++++---- src/classify/classifier/knn.cpp | 6 +-- .../classifier/logistic_regression.cpp | 47 +++++++------------ src/classify/classifier/naive_bayes.cpp | 10 ++-- src/classify/classifier/sgd.cpp | 25 +++------- src/classify/tools/classify.cpp | 5 +- src/corpus/corpus.cpp | 9 +--- src/index/forward_index.cpp | 8 ++-- src/index/inverted_index.cpp | 7 +-- src/index/ranker/okapi_bm25.cpp | 20 +++----- src/index/ranker/pivoted_length.cpp | 4 +- src/index/tools/search.cpp | 4 +- 13 files changed, 53 insertions(+), 117 deletions(-) diff --git a/src/analyzers/tokenizers/icu_tokenizer.cpp b/src/analyzers/tokenizers/icu_tokenizer.cpp index f52e9c9fa..bad28048e 100644 --- a/src/analyzers/tokenizers/icu_tokenizer.cpp +++ b/src/analyzers/tokenizers/icu_tokenizer.cpp @@ -152,10 +152,7 @@ std::unique_ptr { auto language = config.get_as("language"); auto country = config.get_as("country"); - bool suppress_tags = false; - - if (auto stags = config.get_as("suppress-tags")) - suppress_tags = *stags; + bool suppress_tags = config.get_as("suppress-tags").value_or(false); using exception = token_stream::token_stream_exception; diff --git a/src/classify/classifier/dual_perceptron.cpp b/src/classify/classifier/dual_perceptron.cpp index f04f8b28a..cb12f9abf 100644 --- a/src/classify/classifier/dual_perceptron.cpp +++ b/src/classify/classifier/dual_perceptron.cpp @@ -101,21 +101,17 @@ std::unique_ptr make_classifier(const cpptoml::table& config, std::shared_ptr idx) { - auto alpha = dual_perceptron::default_alpha; - if (auto c_alpha = config.get_as("alpha")) - alpha = *c_alpha; + auto alpha = config.get_as("alpha") + .value_or(dual_perceptron::default_alpha); - auto gamma = dual_perceptron::default_gamma; - if (auto c_gamma = config.get_as("gamma")) - gamma = *c_gamma; + auto gamma = config.get_as("gamma") + .value_or(dual_perceptron::default_gamma); - auto bias = dual_perceptron::default_bias; - if (auto c_bias = config.get_as("bias")) - bias = *c_bias; + auto bias + = config.get_as("bias").value_or(dual_perceptron::default_bias); - auto max_iter = dual_perceptron::default_max_iter; - if (auto c_max_iter = config.get_as("max-iter")) - max_iter = *c_max_iter; + auto max_iter = config.get_as("max-iter") + .value_or(dual_perceptron::default_max_iter); auto kernel = config.get_as("kernel"); if (!kernel) diff --git a/src/classify/classifier/knn.cpp b/src/classify/classifier/knn.cpp index c5f9fcc3c..15ce21727 100644 --- a/src/classify/classifier/knn.cpp +++ b/src/classify/classifier/knn.cpp @@ -134,11 +134,7 @@ std::unique_ptr make_multi_index_classifier( throw classifier_factory::exception{ "knn requires a ranker to be specified in its configuration"}; - bool use_weighted = false; - auto weighted = config.get_as("weighted"); - if (weighted) - use_weighted = *weighted; - + auto use_weighted = config.get_as("weighted").value_or(false); return make_unique(std::move(inv_idx), std::move(idx), *k, index::make_ranker(*ranker), use_weighted); } diff --git a/src/classify/classifier/logistic_regression.cpp b/src/classify/classifier/logistic_regression.cpp index 043b97264..5a5dea93c 100644 --- a/src/classify/classifier/logistic_regression.cpp +++ b/src/classify/classifier/logistic_regression.cpp @@ -59,8 +59,8 @@ class_label logistic_regression::classify(doc_id d_id) auto it = argmax(probs.begin(), probs.end(), [](const std::pair& pair) { - return pair.second; - }); + return pair.second; + }); return it->first; } @@ -70,15 
+70,15 @@ void logistic_regression::train(const std::vector& docs) for (const auto& d_id : docs) docs_by_class[idx_->label(d_id)].emplace_back(d_id); using T = decltype(*classifiers_.begin()); - parallel::parallel_for(classifiers_.begin(), classifiers_.end(), - [&](T& pair) - { - auto train_docs = docs_by_class[pair.first]; - auto pivot_docs = docs_by_class[pivot_]; - train_docs.insert(train_docs.end(), pivot_docs.begin(), - pivot_docs.end()); - pair.second.train(train_docs); - }); + parallel::parallel_for( + classifiers_.begin(), classifiers_.end(), [&](T& pair) + { + auto train_docs = docs_by_class[pair.first]; + auto pivot_docs = docs_by_class[pivot_]; + train_docs.insert(train_docs.end(), pivot_docs.begin(), + pivot_docs.end()); + pair.second.train(train_docs); + }); } void logistic_regression::reset() @@ -96,25 +96,12 @@ std::unique_ptr make_classifier( throw classifier_factory::exception{ "prefix must be specified for logistic-regression in config"}; - auto alpha = sgd::default_alpha; - if (auto c_alpha = config.get_as("alpha")) - alpha = *c_alpha; - - auto gamma = sgd::default_gamma; - if (auto c_gamma = config.get_as("gamma")) - gamma = *c_gamma; - - auto bias = sgd::default_bias; - if (auto c_bias = config.get_as("bias")) - bias = *c_bias; - - auto lambda = sgd::default_lambda; - if (auto c_lambda = config.get_as("lambda")) - lambda = *c_lambda; - - auto max_iter = sgd::default_max_iter; - if (auto c_max_iter = config.get_as("max-iter")) - max_iter = *c_max_iter; + auto alpha = config.get_as("alpha").value_or(sgd::default_alpha); + auto gamma = config.get_as("gamma").value_or(sgd::default_gamma); + auto bias = config.get_as("bias").value_or(sgd::default_bias); + auto lambda = config.get_as("lambda").value_or(sgd::default_lambda); + auto max_iter + = config.get_as("max-iter").value_or(sgd::default_max_iter); return make_unique(*prefix, std::move(idx), alpha, gamma, bias, lambda, max_iter); diff --git a/src/classify/classifier/naive_bayes.cpp b/src/classify/classifier/naive_bayes.cpp index f9d1337cc..ba6ada64d 100644 --- a/src/classify/classifier/naive_bayes.cpp +++ b/src/classify/classifier/naive_bayes.cpp @@ -146,13 +146,11 @@ std::unique_ptr make_classifier(const cpptoml::table& config, std::shared_ptr idx) { - auto alpha = naive_bayes::default_alpha; - if (auto c_alpha = config.get_as("alpha")) - alpha = *c_alpha; + auto alpha + = config.get_as("alpha").value_or(naive_bayes::default_alpha); - auto beta = naive_bayes::default_beta; - if (auto c_beta = config.get_as("beta")) - beta = *c_beta; + auto beta + = config.get_as("beta").value_or(naive_bayes::default_beta); return make_unique(std::move(idx), alpha, beta); } diff --git a/src/classify/classifier/sgd.cpp b/src/classify/classifier/sgd.cpp index 3e809f128..db6d14203 100644 --- a/src/classify/classifier/sgd.cpp +++ b/src/classify/classifier/sgd.cpp @@ -139,25 +139,12 @@ std::unique_ptr throw binary_classifier_factory::exception{ "prefix must be specified for sgd in config"}; - auto alpha = sgd::default_alpha; - if (auto c_alpha = config.get_as("alpha")) - alpha = *c_alpha; - - auto gamma = sgd::default_gamma; - if (auto c_gamma = config.get_as("gamma")) - gamma = *c_gamma; - - auto bias = sgd::default_bias; - if (auto c_bias = config.get_as("bias")) - bias = *c_bias; - - auto lambda = sgd::default_lambda; - if (auto c_lambda = config.get_as("lambda")) - lambda = *c_lambda; - - auto max_iter = sgd::default_max_iter; - if (auto c_max_iter = config.get_as("max-iter")) - max_iter = *c_max_iter; + auto alpha = 
config.get_as("alpha").value_or(sgd::default_alpha); + auto gamma = config.get_as("gamma").value_or(sgd::default_gamma); + auto bias = config.get_as("bias").value_or(sgd::default_bias); + auto lambda = config.get_as("lambda").value_or(sgd::default_lambda); + auto max_iter + = config.get_as("max-iter").value_or(sgd::default_max_iter); return make_unique( *prefix, std::move(idx), std::move(positive), std::move(negative), diff --git a/src/classify/tools/classify.cpp b/src/classify/tools/classify.cpp index 794190d90..cedeb8d51 100644 --- a/src/classify/tools/classify.cpp +++ b/src/classify/tools/classify.cpp @@ -106,10 +106,7 @@ int main(int argc, char* argv[]) else classifier = classify::make_classifier(*class_config, f_idx); - bool even = false; - auto even_split = class_config->get_as("even-split"); - if (even_split && *even_split == "true") - even = true; + auto even = class_config->get_as("even-split").value_or(false); cv(*f_idx, *classifier, even); return 0; diff --git a/src/corpus/corpus.cpp b/src/corpus/corpus.cpp index feae632d4..2d4667b84 100644 --- a/src/corpus/corpus.cpp +++ b/src/corpus/corpus.cpp @@ -65,13 +65,8 @@ std::unique_ptr corpus::load(const std::string& config_file) if (!type) throw corpus_exception{"type missing from corpus configuration file"}; - auto enc = corpus_config.get_as("encoding"); - std::string encoding; - if (enc) - encoding = *enc; - else - encoding = "utf-8"; - + auto encoding + = corpus_config.get_as("encoding").value_or("utf-8"); std::unique_ptr result; if (*type == "file-corpus") diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index 7b4e6a9b5..603017f8e 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -203,12 +203,10 @@ void forward_index::create_index(const std::string& config_file) } else { - uint64_t ram_budget = 1024; - if (auto cfg_ram_budget = config.get_as("indexer-ram-budget")) - ram_budget = static_cast(*cfg_ram_budget); + auto ram_budget = static_cast( + config.get_as("indexer-ram-budget").value_or(1024)); - auto uninvert = config.get_as("uninvert"); - if (uninvert && *uninvert) + if (config.get_as("uninvert").value_or(false)) { LOG(info) << "Creating index by uninverting: " << index_name() << ENDLG; diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp index 011ef890f..980d76775 100644 --- a/src/index/inverted_index.cpp +++ b/src/index/inverted_index.cpp @@ -120,11 +120,8 @@ void inverted_index::create_index(const std::string& config_file) auto docs = corpus::corpus::load(config_file); auto config = cpptoml::parse_file(config_file); - auto cfg_ram_budget = config.get_as("indexer-ram-budget"); - - uint64_t ram_budget = 1024; - if (cfg_ram_budget) - ram_budget = static_cast(*cfg_ram_budget); + auto ram_budget = static_cast( + config.get_as("indexer-ram-budget").value_or(1024)); postings_inverter inverter{index_name()}; { diff --git a/src/index/ranker/okapi_bm25.cpp b/src/index/ranker/okapi_bm25.cpp index 7969266a5..164fcb128 100644 --- a/src/index/ranker/okapi_bm25.cpp +++ b/src/index/ranker/okapi_bm25.cpp @@ -30,11 +30,11 @@ float okapi_bm25::score_one(const score_data& sd) 1.0f + (sd.num_docs - sd.doc_count + 0.5f) / (sd.doc_count + 0.5f)); float TF = ((k1_ + 1.0f) * sd.doc_term_count) - / ((k1_ * ((1.0f - b_) + b_ * doc_len / sd.avg_dl)) - + sd.doc_term_count); + / ((k1_ * ((1.0f - b_) + b_ * doc_len / sd.avg_dl)) + + sd.doc_term_count); float QTF = ((k3_ + 1.0f) * sd.query_term_weight) - / (k3_ + sd.query_term_weight); + / (k3_ + sd.query_term_weight); return TF * IDF * 
QTF;
 }

@@ -42,17 +42,9 @@ float okapi_bm25::score_one(const score_data& sd)
 template <>
 std::unique_ptr make_ranker(const cpptoml::table& config)
 {
-    auto k1 = okapi_bm25::default_k1;
-    if (auto c_k1 = config.get_as("k1"))
-        k1 = *c_k1;
-
-    auto b = okapi_bm25::default_b;
-    if (auto c_b = config.get_as("b"))
-        b = *c_b;
-
-    auto k3 = okapi_bm25::default_k3;
-    if (auto c_k3 = config.get_as("k3"))
-        k3 = *c_k3;
+    auto k1 = config.get_as("k1").value_or(okapi_bm25::default_k1);
+    auto b = config.get_as("b").value_or(okapi_bm25::default_b);
+    auto k3 = config.get_as("k3").value_or(okapi_bm25::default_k3);

     return make_unique(k1, b, k3);
 }
diff --git a/src/index/ranker/pivoted_length.cpp b/src/index/ranker/pivoted_length.cpp
index 589f0c9e1..2daf3f99b 100644
--- a/src/index/ranker/pivoted_length.cpp
+++ b/src/index/ranker/pivoted_length.cpp
@@ -35,9 +35,7 @@ template <>
 std::unique_ptr
     make_ranker(const cpptoml::table& config)
 {
-    auto s = pivoted_length::default_s;
-    if (auto c_s = config.get_as("s"))
-        s = *c_s;
+    auto s = config.get_as("s").value_or(pivoted_length::default_s);

     return make_unique(s);
 }
 }
diff --git a/src/index/tools/search.cpp b/src/index/tools/search.cpp
index 8fcb8ad04..ae3bd93d1 100644
--- a/src/index/tools/search.cpp
+++ b/src/index/tools/search.cpp
@@ -50,9 +50,7 @@ int main(int argc, char* argv[])
     auto ranker = index::make_ranker(*group);

     // Use UTF-8 for the default encoding unless otherwise specified.
-    std::string encoding = "utf-8";
-    if (auto enc = config.get_as("encoding"))
-        encoding = *enc;
+    auto encoding = config.get_as("encoding").value_or("utf-8");

     // Time how long it takes to create the index. By default, common::time's
     // unit of measurement is milliseconds.

From b5bb75796171eb6a75a8d5cf8853908f93862e21 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Sun, 6 Sep 2015 18:08:21 -0500
Subject: [PATCH 249/481] Fix undefined reference errors for constexpr defaults.
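In C++11/14, the in-class initializer of a static constexpr data member
is only a declaration; once the member is odr-used (and binding it to a
reference parameter, as happens when the previous commit passes the
defaults to cpptoml's value_or, is an odr-use), it also needs an
out-of-line definition in exactly one translation unit, or the build
fails at link time with "undefined reference". A minimal sketch of the
pattern this commit applies (the class and value are illustrative):

    // header: declaration with initializer
    struct ranker_like
    {
        static constexpr float default_k1 = 1.2f;
    };

    // exactly one .cpp file: the definition, with no initializer;
    // without this line, odr-using default_k1 fails to link in C++11/14
    const constexpr float ranker_like::default_k1;

(C++17 later made static constexpr members implicitly inline, which is
what removes the need for these out-of-line definitions.)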
--- src/classify/classifier/dual_perceptron.cpp | 4 ++++ src/classify/classifier/naive_bayes.cpp | 2 ++ src/classify/classifier/sgd.cpp | 5 +++++ src/index/ranker/okapi_bm25.cpp | 4 ++++ src/index/ranker/pivoted_length.cpp | 1 + 5 files changed, 16 insertions(+) diff --git a/src/classify/classifier/dual_perceptron.cpp b/src/classify/classifier/dual_perceptron.cpp index cb12f9abf..84284a26e 100644 --- a/src/classify/classifier/dual_perceptron.cpp +++ b/src/classify/classifier/dual_perceptron.cpp @@ -20,6 +20,10 @@ namespace classify { const util::string_view dual_perceptron::id = "dual-perceptron"; +const constexpr double dual_perceptron::default_alpha; +const constexpr double dual_perceptron::default_gamma; +const constexpr double dual_perceptron::default_bias; +const constexpr uint64_t dual_perceptron::default_max_iter; void dual_perceptron::train(const std::vector& docs) { diff --git a/src/classify/classifier/naive_bayes.cpp b/src/classify/classifier/naive_bayes.cpp index ba6ada64d..d7dd032a7 100644 --- a/src/classify/classifier/naive_bayes.cpp +++ b/src/classify/classifier/naive_bayes.cpp @@ -20,6 +20,8 @@ namespace classify { const util::string_view naive_bayes::id = "naive-bayes"; +const constexpr double naive_bayes::default_alpha; +const constexpr double naive_bayes::default_beta; naive_bayes::naive_bayes(std::shared_ptr idx, double alpha, double beta) diff --git a/src/classify/classifier/sgd.cpp b/src/classify/classifier/sgd.cpp index db6d14203..de10513ef 100644 --- a/src/classify/classifier/sgd.cpp +++ b/src/classify/classifier/sgd.cpp @@ -16,6 +16,11 @@ namespace classify { const util::string_view sgd::id = "sgd"; +const constexpr double sgd::default_alpha; +const constexpr double sgd::default_gamma; +const constexpr double sgd::default_bias; +const constexpr double sgd::default_lambda; +const constexpr size_t sgd::default_max_iter; sgd::sgd(const std::string& prefix, std::shared_ptr idx, class_label positive, class_label negative, diff --git a/src/index/ranker/okapi_bm25.cpp b/src/index/ranker/okapi_bm25.cpp index 164fcb128..0b3cb7808 100644 --- a/src/index/ranker/okapi_bm25.cpp +++ b/src/index/ranker/okapi_bm25.cpp @@ -15,6 +15,10 @@ namespace index { const util::string_view okapi_bm25::id = "bm25"; +const constexpr float okapi_bm25::default_k1; +const constexpr float okapi_bm25::default_b; +const constexpr float okapi_bm25::default_k3; + okapi_bm25::okapi_bm25(float k1, float b, float k3) : k1_{k1}, b_{b}, k3_{k3} { diff --git a/src/index/ranker/pivoted_length.cpp b/src/index/ranker/pivoted_length.cpp index 2daf3f99b..0f66171bb 100644 --- a/src/index/ranker/pivoted_length.cpp +++ b/src/index/ranker/pivoted_length.cpp @@ -14,6 +14,7 @@ namespace index { const util::string_view pivoted_length::id = "pivoted-length"; +const constexpr float pivoted_length::default_s; pivoted_length::pivoted_length(float s) : s_{s} { From 67e1fcb6b3e75ffdd247d1867fb58a5fd2c1fce8 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sun, 6 Sep 2015 19:47:01 -0500 Subject: [PATCH 250/481] Bump porter2_stemmer version. 
--- deps/porter2_stemmer | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/porter2_stemmer b/deps/porter2_stemmer index a9718c892..06891b3bf 160000 --- a/deps/porter2_stemmer +++ b/deps/porter2_stemmer @@ -1 +1 @@ -Subproject commit a9718c892a935baff774dfb450f21e864a14c311 +Subproject commit 06891b3bf8daf7cb8ca56954aff646a86f20724b From 869ef15527c0a9185ad4f93cf7db1d918c8b69a7 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 7 Sep 2015 18:24:39 -0500 Subject: [PATCH 251/481] create util::fixed_heap --- include/util/fixed_heap.h | 71 +++++++++++++++++++++++++++++++++++++ include/util/fixed_heap.tcc | 60 +++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 include/util/fixed_heap.h create mode 100644 include/util/fixed_heap.tcc diff --git a/include/util/fixed_heap.h b/include/util/fixed_heap.h new file mode 100644 index 000000000..fae715d3b --- /dev/null +++ b/include/util/fixed_heap.h @@ -0,0 +1,71 @@ +/** + * @file fixed_heap.h + * @author Sean Massung + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_FIXED_HEAP_H_ +#define META_FIXED_HEAP_H_ + +#include + +namespace meta +{ +namespace util +{ +/** + * Keeps a constant number of high-priority elements. This is useful for finding + * the "top-k" T elements using the comparison function Comp. + */ +template +class fixed_heap +{ + public: + /** + * @param max_elems + * @param comp The priority comparison function for elements in this heap + */ + fixed_heap(uint64_t max_elems, Comp comp); + + /** + * @param elem The element to insert; it may or may not be inserted + * depending on the size and priority of other elements in the heap + */ + void push(const T& elem); + + /** + * @param elem The element to emplace; it may or may not be inserted + * depending on the size and priority of other elements in the heap + */ + template + void emplace(Args&&... args); + + /** + * @return the current number of elements in this heap; will always be less + * than or equal to max_elems() + */ + uint64_t size() const; + + /** + * @return the maximum number of elements this heap will store + */ + uint64_t max_elems() const; + + /** + * @return a reverse-sorted list + */ + std::vector reverse_and_clear(); + + private: + uint64_t max_elems_; + Comp comp_; + std::priority_queue, decltype(comp_)> pq_; +}; +} +} + +#include "fixed_heap.tcc" +#endif diff --git a/include/util/fixed_heap.tcc b/include/util/fixed_heap.tcc new file mode 100644 index 000000000..23bd42bd2 --- /dev/null +++ b/include/util/fixed_heap.tcc @@ -0,0 +1,60 @@ +/** + * @file fixed_heap.tcc + * @author Sean Massung + */ + +namespace meta +{ +namespace util +{ +template +fixed_heap::fixed_heap(uint64_t max_elems, Comp comp) + : max_elems_{max_elems}, comp_{comp}, pq_{comp} +{ + // nothing +} + +template +template +void fixed_heap::emplace(Args&&... 
args) +{ + pq_.emplace(std::forward(args)...); + if (size() > max_elems()) + pq_.pop(); +} + +template +void fixed_heap::push(const T& elem) +{ + pq_.push(elem); + if (size() > max_elems()) + pq_.pop(); +} + +template +uint64_t fixed_heap::size() const +{ + return pq_.size(); +} + +template +uint64_t fixed_heap::max_elems() const +{ + return max_elems_; +} + +template +std::vector fixed_heap::reverse_and_clear() +{ + std::vector sorted; + sorted.reserve(size()); + while (!pq_.empty()) + { + sorted.emplace_back(std::move(pq_.top())); + pq_.pop(); + } + std::reverse(sorted.begin(), sorted.end()); + return sorted; +} +} +} From b0ee54e9d11c2c0feb5000e1de53101ea867cbd9 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 7 Sep 2015 18:27:09 -0500 Subject: [PATCH 252/481] use util::fixed_heap in lm::diff --- src/lm/diff.cpp | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index 7017356ed..4bddda12d 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -8,6 +8,7 @@ #include "lm/diff.h" #include "porter2_stemmer.h" #include "utf/utf.h" +#include "util/fixed_heap.h" namespace meta { @@ -46,6 +47,8 @@ diff::diff(const cpptoml::table& config) : lm_{config} auto lambda = table->get_as("lambda"); lambda_ = lambda ? *lambda : 0.5; + if (lambda_ < 0.0 || lambda_ > 1.0) + throw diff_exception{"lambda value has to be on [0,1]"}; auto lm_gen = table->get_as("lm-generate"); lm_generate_ = lm_gen ? *lm_gen : false; @@ -63,22 +66,12 @@ std::vector> { return a.second < b.second; }; - std::priority_queue, decltype(comp)> candidates{ - comp}; - add(candidates, sent); + util::fixed_heap candidates{max_cand_size_, comp}; seen_.clear(); + add(candidates, sent); step(sent, candidates, 0); - - std::vector sorted; - sorted.reserve(candidates.size()); - while (!candidates.empty()) - { - sorted.emplace_back(std::move(candidates.top())); - candidates.pop(); - } - std::reverse(sorted.begin(), sorted.end()); - return sorted; + return candidates.reverse_and_clear(); } template @@ -88,8 +81,6 @@ void diff::add(PQ& candidates, const sentence& sent) auto score = lambda_ * lm_.perplexity_per_word(sent) + (1.0 - lambda_) * sent.average_weight(); candidates.emplace(sent, score); - if (candidates.size() > max_cand_size_) - candidates.pop(); } uint64_t diff::least_likely_ngram(const sentence& sent) const From 253143a5c723ddc2da9c9a26f35a15ec5f589f99 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 7 Sep 2015 18:32:27 -0500 Subject: [PATCH 253/481] use util::fixed_heap in index::ranker --- src/index/ranker/ranker.cpp | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/src/index/ranker/ranker.cpp b/src/index/ranker/ranker.cpp index f93f7c540..0c67e8b79 100644 --- a/src/index/ranker/ranker.cpp +++ b/src/index/ranker/ranker.cpp @@ -10,6 +10,7 @@ #include "index/postings_data.h" #include "index/ranker/ranker.h" #include "index/score_data.h" +#include "util/fixed_heap.h" namespace meta { @@ -56,14 +57,12 @@ std::vector ranker::score( score_data sd{idx, idx.avg_doc_length(), idx.num_docs(), idx.total_corpus_terms(), query}; - std::vector results; - results.reserve(num_results + 1); // +1 since we use this as a heap and - // prune when it exceeds size num_results auto comp = [](const search_result& a, const search_result& b) { // comparison is reversed since we want a min-heap return a.score > b.score; }; + util::fixed_heap results{num_results, comp}; std::vector postings; postings.reserve(query.counts().size()); @@ 
-130,24 +129,12 @@ std::vector ranker::score( } } - // add doc to the heap and poll if needed - results.emplace_back(cur_doc, score); - std::push_heap(results.begin(), results.end(), comp); - if (results.size() > num_results) - { - std::pop_heap(results.begin(), results.end(), comp); - results.pop_back(); - } - + results.emplace(cur_doc, score); cur_doc = next_doc; next_doc = doc_id{idx.num_docs()}; } - // heap sort the values - for (auto end = results.end(); end != results.begin(); --end) - std::pop_heap(results.begin(), end, comp); - - return results; + return results.reverse_and_clear(); } float ranker::initial_score(const score_data&) const From c14333fbf2ee9e1e60941004b052eb67773b3e07 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 7 Sep 2015 18:38:41 -0500 Subject: [PATCH 254/481] use util::fixed_heap in top-k --- src/tools/top_k.cpp | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/src/tools/top_k.cpp b/src/tools/top_k.cpp index 19f01a415..b288a01cc 100644 --- a/src/tools/top_k.cpp +++ b/src/tools/top_k.cpp @@ -7,13 +7,13 @@ #include #include #include -#include #include #include "cpptoml.h" #include "corpus/corpus.h" #include "analyzers/analyzer.h" #include "analyzers/filters/all.h" #include "util/progress.h" +#include "util/fixed_heap.h" #include "logging/logger.h" using namespace meta; @@ -60,22 +60,11 @@ int main(int argc, char* argv[]) { return a.second > b.second; }; - std::priority_queue, decltype(comp)> terms{ - comp}; + util::fixed_heap terms{k, comp}; for (auto& term : counts) - { terms.emplace(term); - if (terms.size() > k) - terms.pop(); - } - - std::vector sorted; - while (!terms.empty()) - { - sorted.emplace_back(std::move(terms.top())); - terms.pop(); - } - for (auto it = sorted.rbegin(); it != sorted.rend(); ++it) - std::cout << it->first << "\t" << it->second << std::endl; + auto sorted = terms.reverse_and_clear(); + for (const auto& it : sorted) + std::cout << it.first << "\t" << it.second << std::endl; } From 6b25730b759ca43cd2d80d181a7d37e2ad552f8e Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 7 Sep 2015 18:48:21 -0500 Subject: [PATCH 255/481] use util::fixed_heap in language_model::top_k --- src/lm/language_model.cpp | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index 7e8b8540d..f99a2fbcc 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -11,6 +11,7 @@ #include #include "util/time.h" #include "util/shim.h" +#include "util/fixed_heap.h" #include "lm/language_model.h" #include "logging/logger.h" @@ -120,26 +121,15 @@ std::vector> { return a.second > b.second; }; - std::vector candidates; + util::fixed_heap candidates{k, comp}; - sentence candidate = prev; - candidate.push_back("word"); // the last item is replaced each iteration for (const auto& word : vocabulary_) { auto candidate = sentence{prev.to_string() + " " + word}; - candidates.emplace_back(word, log_prob(candidate)); - std::push_heap(candidates.begin(), candidates.end(), comp); - if (candidates.size() > k) - { - std::pop_heap(candidates.begin(), candidates.end(), comp); - candidates.pop_back(); - } + candidates.emplace(word, log_prob(candidate)); } - for (auto end = candidates.end(); end != candidates.begin(); --end) - std::pop_heap(candidates.begin(), end, comp); - - return candidates; + return candidates.reverse_and_clear(); } void language_model::load_vocab() From 93361ef23f0003751b616f515d1c966c37e7ebd4 Mon Sep 17 00:00:00 2001 
From: Sean Massung Date: Mon, 7 Sep 2015 18:59:21 -0500 Subject: [PATCH 256/481] fix "excess elements in struct initializer"? --- include/util/fixed_heap.tcc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/util/fixed_heap.tcc b/include/util/fixed_heap.tcc index 23bd42bd2..09166531d 100644 --- a/include/util/fixed_heap.tcc +++ b/include/util/fixed_heap.tcc @@ -9,7 +9,7 @@ namespace util { template fixed_heap::fixed_heap(uint64_t max_elems, Comp comp) - : max_elems_{max_elems}, comp_{comp}, pq_{comp} + : max_elems_{max_elems}, comp_(comp), pq_{comp} { // nothing } From 5e11693853c4698e90cd8cba04b99d72f9c7da1d Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 7 Sep 2015 19:18:01 -0500 Subject: [PATCH 257/481] add algorithm include to fixed_heap --- include/util/fixed_heap.tcc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/util/fixed_heap.tcc b/include/util/fixed_heap.tcc index 09166531d..2ed01a902 100644 --- a/include/util/fixed_heap.tcc +++ b/include/util/fixed_heap.tcc @@ -3,6 +3,8 @@ * @author Sean Massung */ +#include + namespace meta { namespace util From 0a0ea3ded6399de3909737edc7ee5bf28c58c8e6 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 7 Sep 2015 19:40:32 -0500 Subject: [PATCH 258/481] create a hash specialization for lm::sentence and use it in lm::diff unfortunately, no performance benefit realized on the CoNLL sentences --- include/lm/diff.h | 4 ++-- include/lm/sentence.h | 43 +++++++++++++++++++++++++++++++++++++++++++ src/lm/diff.cpp | 12 ++++++------ src/lm/sentence.cpp | 5 +++++ 4 files changed, 56 insertions(+), 8 deletions(-) diff --git a/include/lm/diff.h b/include/lm/diff.h index b19c2c0f3..346290fa2 100644 --- a/include/lm/diff.h +++ b/include/lm/diff.h @@ -185,8 +185,8 @@ class diff std::vector fwords_; /// Keeps track of sentences that have already been generated so we don't - /// perform redundant calcualtions - std::unordered_set seen_; + /// perform redundant calculations + std::unordered_set seen_; /// How many candidate sentences to store when calling diff::candidates uint64_t max_cand_size_; diff --git a/include/lm/sentence.h b/include/lm/sentence.h index f35126585..9629ea756 100644 --- a/include/lm/sentence.h +++ b/include/lm/sentence.h @@ -10,6 +10,7 @@ #include #include #include +#include "util/hash.h" namespace meta { @@ -101,6 +102,11 @@ class sentence */ const std::vector& operations() const; + /** + * @return the sequence of tokens that comprise this sentence + */ + const std::deque& tokens() const; + /** * @return the token at the front of the sentence */ @@ -185,7 +191,44 @@ class sentence_exception : public std::runtime_error { using std::runtime_error::runtime_error; }; + +inline bool operator==(const sentence& lhs, const sentence& rhs) +{ + return lhs.tokens() == rhs.tokens(); +} + +inline bool operator!=(const sentence& lhs, const sentence& rhs) +{ + return !(lhs == rhs); +} } } +namespace std +{ +template <> +struct hash +{ +#if META_HAS_NONEMPTY_HASH_SUPPORT + meta::util::murmur_hash<> hasher; +#endif + + size_t operator()(const meta::lm::sentence& sent) const noexcept + { +#ifndef META_HAS_NONEMPTY_HASH_SUPPORT + meta::util::murmur_hash<> hasher{89122527}; +#endif + // create a vector of hashes of all the tokens in the sentence + std::vector hashed; + for (const auto& word : sent) + hashed.push_back(hasher( + reinterpret_cast(word.data()), word.size())); + + // hash the hashes as sequences of uint8_ts + return hasher(reinterpret_cast(hashed.data()), + hashed.size() * sizeof(std::size_t)); + } +}; 
+} + #endif diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index 4bddda12d..cd6155362 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -77,7 +77,7 @@ std::vector> template void diff::add(PQ& candidates, const sentence& sent) { - seen_.insert(sent.to_string()); + seen_.insert(sent); auto score = lambda_ * lm_.perplexity_per_word(sent) + (1.0 - lambda_) * sent.average_weight(); candidates.emplace(sent, score); @@ -128,7 +128,7 @@ void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth) ins_cpy.insert(best_idx, next.first, base_penalty_ + insert_penalty_); - if (seen_.find(ins_cpy.to_string()) == seen_.end()) + if (seen_.find(ins_cpy) == seen_.end()) { add(candidates, ins_cpy); step(ins_cpy, candidates, depth + 1); @@ -138,7 +138,7 @@ void diff::lm_ops(const sentence& sent, PQ& candidates, uint64_t depth) sub_cpy.substitute(best_idx, next.first, base_penalty_ + substitute_penalty_); - if (seen_.find(sub_cpy.to_string()) == seen_.end()) + if (seen_.find(sub_cpy) == seen_.end()) { add(candidates, sub_cpy); step(sub_cpy, candidates, depth + 1); @@ -160,7 +160,7 @@ void diff::insert(const sentence& sent, size_t idx, PQ& candidates, { sentence ins_cpy{sent}; ins_cpy.insert(idx, fw, base_penalty_ + insert_penalty_); - if (seen_.find(ins_cpy.to_string()) == seen_.end()) + if (seen_.find(ins_cpy) == seen_.end()) { add(candidates, ins_cpy); step(ins_cpy, candidates, depth + 1); @@ -184,7 +184,7 @@ void diff::substitute(const sentence& sent, size_t idx, PQ& candidates, continue; sentence subbed{sent}; subbed.substitute(idx, stem, base_penalty_ + substitute_penalty_); - if (seen_.find(subbed.to_string()) == seen_.end()) + if (seen_.find(subbed) == seen_.end()) { add(candidates, subbed); step(subbed, candidates, depth + 1); @@ -199,7 +199,7 @@ void diff::remove(const sentence& sent, size_t idx, PQ& candidates, { sentence rem_cpy{sent}; rem_cpy.remove(idx, base_penalty_ + remove_penalty_); - if (seen_.find(rem_cpy.to_string()) == seen_.end()) + if (seen_.find(rem_cpy) == seen_.end()) { add(candidates, rem_cpy); step(rem_cpy, candidates, depth + 1); diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp index 47b21968a..24cb979e5 100644 --- a/src/lm/sentence.cpp +++ b/src/lm/sentence.cpp @@ -119,6 +119,11 @@ const std::vector& sentence::operations() const return ops_; } +const std::deque& sentence::tokens() const +{ + return tokens_; +} + const std::string& sentence::front() const { return tokens_.front(); From d179f647b66ba211e805168595b705ddc77ac7a1 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Mon, 7 Sep 2015 20:06:07 -0500 Subject: [PATCH 259/481] use utf::segmenter in lm::sentence instead of going through analyzer pipeline --- src/lm/sentence.cpp | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp index 24cb979e5..68426ac8d 100644 --- a/src/lm/sentence.cpp +++ b/src/lm/sentence.cpp @@ -3,15 +3,13 @@ * @author Sean Massung */ +#include #include #include #include #include #include "lm/sentence.h" -#include "analyzers/analyzer.h" -#include "analyzers/tokenizers/icu_tokenizer.h" -#include "analyzers/tokenizers/whitespace_tokenizer.h" -#include "analyzers/filters/all.h" +#include "utf/segmenter.h" namespace meta { @@ -21,21 +19,14 @@ sentence::sentence(const std::string& text, bool tokenize /* = true */) { if (tokenize) { - using namespace analyzers; - std::unique_ptr stream; - stream = make_unique(); - stream = make_unique(std::move(stream)); - std::string text_copy{text}; // consider changing 
parameter to non-const - stream->set_content(std::move(text_copy)); - while (*stream) - tokens_.push_back(stream->next()); - - if (tokens_.empty()) - throw sentence_exception{"empty token stream"}; - - // remove sentence markers - tokens_.pop_front(); - tokens_.pop_back(); + utf::segmenter segmenter; + segmenter.set_content(text); + for (const auto& word : segmenter.words()) + { + auto str = segmenter.content(word); + if (!str.empty() && !std::all_of(str.begin(), str.end(), ::isspace)) + tokens_.emplace_back(std::move(str)); + } if (tokens_.empty()) throw sentence_exception{"empty token stream"}; From a441f226fc8e00cc682d1fbbf808d7df0720a1d5 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 8 Sep 2015 01:58:10 -0500 Subject: [PATCH 260/481] Update hash support to mirror N3980. http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n3980.html I think this setup is AMAZING, so I'm just going to adopt it wholesale right now for our hashing support. --- include/util/hash.h | 496 ++++++++++++++++++++++++++++++------- include/util/string_view.h | 42 ++-- 2 files changed, 430 insertions(+), 108 deletions(-) diff --git a/include/util/hash.h b/include/util/hash.h index 189317e9d..516d82d41 100644 --- a/include/util/hash.h +++ b/include/util/hash.h @@ -10,6 +10,8 @@ #ifndef META_UTIL_HASH_H_ #define META_UTIL_HASH_H_ +#include +#include #include #include @@ -18,6 +20,40 @@ namespace meta namespace util { +namespace detail +{ +template +struct static_and; + +template +struct static_and +{ + const static constexpr bool value = B && static_and::value; +}; + +template <> +struct static_and<> +{ + const static constexpr bool value = true; +}; + +template +struct static_add; + +template +struct static_add +{ + const static constexpr std::size_t value = Size + + static_add::value; +}; + +template <> +struct static_add<> +{ + const static constexpr std::size_t value = 0; +}; +} + /** * Implementation of MurmurHash3. Depending on the template parameter, it * will return a 32-bit or 64-bit hash value. @@ -66,65 +102,87 @@ inline uint64_t fmix(uint64_t h) template <> class murmur_hash<4> { - public: - murmur_hash() : seed_{std::random_device{}()} + private: + // this *has* to be uint32_t for OS X clang to correctly resolve + // between the two versions of rotl/fmix in namespace detail above. + uint32_t out_; + std::array buf_; + std::size_t buflen_; + std::size_t total_length_; + + const static constexpr uint32_t c1 = 0xcc9e2d51; + const static constexpr uint32_t c2 = 0x1b873593; + + void handle_block_4(uint32_t block) { + block *= c1; + block = detail::rotl(block, 15); + block *= c2; + + out_ ^= block; + out_ = detail::rotl(out_, 13); + out_ = out_ * 5 + 0xe6546b64; } - murmur_hash(std::size_t seed) : seed_{seed} + public: + using result_type = std::size_t; + + murmur_hash(std::size_t seed) : out_(seed), buflen_{0}, total_length_{0} { } - std::size_t operator()(const uint8_t* data, int len) const + void operator()(const void* in, std::size_t len) { - // this *has* to be uint32_t for OS X clang to correctly resolve - // between the two versions of rotl/fmix in namespace detail above. 
- uint32_t out = seed_; + auto data = reinterpret_cast(in); + total_length_ += len; - const auto nblocks = len / 4; + // handle 4-byte blocks at a time, starting from the data we had + // "left over" from the last call to operator() + auto end = data + len; + while (buflen_ > 0 && buflen_ < 4 && data < end) + buf_[buflen_++] = *(data++); - const uint32_t c1 = 0xcc9e2d51; - const uint32_t c2 = 0x1b873593; + if (buflen_ / 4 > 0) + { + handle_block_4(reinterpret_cast(buf_.data())[0]); + buflen_ = 0; + } + // now handle the remaining 4-byte blocks in this data + const auto nblocks = (end - data) / 4; auto blocks = reinterpret_cast(data + nblocks * 4); - for (int i = -nblocks; i; ++i) - { - auto k1 = blocks[i]; - - k1 *= c1; - k1 = detail::rotl(k1, 15); - k1 *= c2; - - out ^= k1; - out = detail::rotl(out, 13); - out = out * 5 + 0xe6546b64; - } + handle_block_4(blocks[i]); + // copy over the remaining 3 bytes or less for finalizing or use on + // the next call to operator() const uint8_t* tail = data + nblocks * 4; + buflen_ = end - tail; + assert(buflen_ < 4); + std::copy(tail, end, buf_.begin()); + } + explicit operator std::size_t() + { uint32_t k1 = 0; - switch (len & 3) + switch (buflen_ & 3) { case 3: - k1 ^= tail[2] << 16; + k1 ^= buf_[2] << 16; case 2: - k1 ^= tail[1] << 8; + k1 ^= buf_[1] << 8; case 1: - k1 ^= tail[0]; + k1 ^= buf_[0]; k1 *= c1; k1 = detail::rotl(k1, 15); k1 *= c2; - out ^= k1; + out_ ^= k1; } - out ^= len; + out_ ^= total_length_; - return detail::fmix(out); + return detail::fmix(out_); } - - private: - std::size_t seed_; }; /** @@ -133,116 +191,370 @@ class murmur_hash<4> template <> class murmur_hash<8> { - public: - murmur_hash() : seed_{std::random_device{}()} + private: + uint64_t h1_; + uint64_t h2_; + std::array buf_; + std::size_t buflen_; + std::size_t total_length_; + + const static constexpr uint64_t c1 = 0x87c37b91114253d5LLU; + const static constexpr uint64_t c2 = 0x4cf5ad432745937fLLU; + + void handle_block_16(const uint8_t* start) { + auto blocks = reinterpret_cast(start); + auto k1 = blocks[0]; + auto k2 = blocks[1]; + + k1 *= c1; + k1 = detail::rotl(k1, 31); + k1 *= c2; + h1_ ^= k1; + + h1_ = detail::rotl(h1_, 27); + h1_ += h2_; + h1_ = h1_ * 5 + 0x52dce729; + + k2 *= c2; + k2 = detail::rotl(k2, 33); + k2 *= c1; + h2_ ^= k2; + + h2_ = detail::rotl(h2_, 31); + h2_ += h1_; + h2_ = h2_ * 5 + 0x38495ab5; } - murmur_hash(uint64_t seed) : seed_{seed} + public: + using result_type = std::size_t; + + murmur_hash(uint64_t seed) + : h1_{seed}, h2_{seed}, buflen_{0}, total_length_{0} { } - std::size_t operator()(const uint8_t* data, int len) const + void operator()(const void* in, std::size_t len) { - const auto nblocks = len / 16; + auto data = reinterpret_cast(in); + total_length_ += len; - auto h1 = seed_; - auto h2 = seed_; + // handle 16-byte blocks at a time, starting from the data we had + // "left over" from the last call to operator() + auto end = data + len; + while (buflen_ > 0 && buflen_ < 16 && data < end) + buf_[buflen_++] = *(data++); - const uint64_t c1 = 0x87c37b91114253d5LLU; - const uint64_t c2 = 0x4cf5ad432745937fLLU; - - auto blocks = reinterpret_cast(data); + if (buflen_ / 16 > 0) + { + handle_block_16(buf_.data()); + buflen_ = 0; + } + // now handle the remaining 16-byte blocks in this data + const auto nblocks = (end - data) / 16; for (int i = 0; i < nblocks; ++i) { - auto k1 = blocks[i * 2]; - auto k2 = blocks[i * 2 + 1]; - - k1 *= c1; - k1 = detail::rotl(k1, 31); - k1 *= c2; - h1 ^= k1; - - h1 = detail::rotl(h1, 27); - h1 += h2; - h1 = 
h1 * 5 + 0x52dce729; - - k2 *= c2; - k2 = detail::rotl(k2, 33); - k2 *= c1; - h2 ^= k2; - - h2 = detail::rotl(h2, 31); - h2 += h1; - h2 = h2 * 5 + 0x38495ab5; + handle_block_16(data); + data += 16; } - auto tail = data + nblocks * 16; + // copy over the remaining 15 bytes or less for finalizing or use + // on the next call to operator() + buflen_ = end - data; + assert(buflen_ < 16); + std::copy(data, end, buf_.begin()); + } + explicit operator std::size_t() + { uint64_t k1 = 0; uint64_t k2 = 0; - switch (len & 15) + switch (buflen_) { case 15: - k2 ^= static_cast(tail[14]) << 48; + k2 ^= static_cast(buf_[14]) << 48; case 14: - k2 ^= static_cast(tail[13]) << 40; + k2 ^= static_cast(buf_[13]) << 40; case 13: - k2 ^= static_cast(tail[12]) << 32; + k2 ^= static_cast(buf_[12]) << 32; case 12: - k2 ^= static_cast(tail[11]) << 24; + k2 ^= static_cast(buf_[11]) << 24; case 11: - k2 ^= static_cast(tail[10]) << 16; + k2 ^= static_cast(buf_[10]) << 16; case 10: - k2 ^= static_cast(tail[9]) << 8; + k2 ^= static_cast(buf_[9]) << 8; case 9: - k2 ^= static_cast(tail[8]); + k2 ^= static_cast(buf_[8]); k2 *= c2; k2 = detail::rotl(k2, 33); k2 *= c1; - h2 ^= k2; + h2_ ^= k2; case 8: - k1 ^= static_cast(tail[7]) << 56; + k1 ^= static_cast(buf_[7]) << 56; case 7: - k1 ^= static_cast(tail[6]) << 48; + k1 ^= static_cast(buf_[6]) << 48; case 6: - k1 ^= static_cast(tail[5]) << 40; + k1 ^= static_cast(buf_[5]) << 40; case 5: - k1 ^= static_cast(tail[4]) << 32; + k1 ^= static_cast(buf_[4]) << 32; case 4: - k1 ^= static_cast(tail[3]) << 24; + k1 ^= static_cast(buf_[3]) << 24; case 3: - k1 ^= static_cast(tail[2]) << 16; + k1 ^= static_cast(buf_[2]) << 16; case 2: - k1 ^= static_cast(tail[1]) << 8; + k1 ^= static_cast(buf_[1]) << 8; case 1: - k1 ^= static_cast(tail[0]); + k1 ^= static_cast(buf_[0]); k1 *= c1; k1 = detail::rotl(k1, 31); k1 *= c2; - h1 ^= k1; + h1_ ^= k1; } - h1 ^= len; - h2 ^= len; + h1_ ^= total_length_; + h2_ ^= total_length_; - h1 += h2; - h2 += h1; + h1_ += h2_; + h2_ += h1_; - h1 = detail::fmix(h1); - h2 = detail::fmix(h2); + h1_ = detail::fmix(h1_); + h2_ = detail::fmix(h2_); - h1 += h2; + h1_ += h2_; // h2 += h1, unneeded since we only want 64-bits. 
- return h1; + return h1_; } +}; - private: - uint64_t seed_; +template +struct is_contiguously_hashable +{ + const static constexpr bool value = std::is_integral::value + || std::is_enum::value + || std::is_pointer::value; +}; + +template +struct is_contiguously_hashable : public is_contiguously_hashable +{ +}; + +template +struct is_contiguously_hashable + : public is_contiguously_hashable +{ +}; + +template +struct is_contiguously_hashable : public is_contiguously_hashable +{ +}; + +template +struct is_contiguously_hashable> +{ + const static constexpr bool value = is_contiguously_hashable::value + && is_contiguously_hashable::value + && sizeof(T) + sizeof(U) + == sizeof(std::pair); +}; + +template +struct is_contiguously_hashable> +{ + const static constexpr bool value + = detail::static_and::value...>::value + && detail::static_add::value + == sizeof(std::tuple); +}; + +template +struct is_contiguously_hashable> +{ + const static constexpr bool value = is_contiguously_hashable::value + && sizeof(T) * N + == sizeof(std::array); +}; + +template +inline typename std::enable_if::value>::type + hash_append(HashAlgorithm& h, const T& t) +{ + h(std::addressof(t), sizeof(t)); +} + +template +inline typename std::enable_if::value>::type + hash_append(HashAlgorithm& h, T t) +{ + // -0 and 0 are the same, but have different bit patterns, so normalize + // to positive zero before hashing + if (t == 0) + t = 0; + h(std::addressof(t), sizeof(t)); +} + +template +inline void hash_append(HashAlgorithm& h, std::nullptr_t) +{ + const void* p = nullptr; + h(std::addressof(p), sizeof(p)); +} + +// all of these hash_appends below need to be forward declared so they can +// find one another in their implementations + +template +typename std::enable_if::value>::type + hash_append(HashAlgorithm& h, T(&a)[N]); + +template +typename std::enable_if>::value>::type + hash_append(HashAlgorithm& h, const std::pair& p); + +template +typename std::enable_if>::value>:: + type + hash_append(HashAlgorithm& h, const std::tuple& t); + +template +typename std::enable_if>::value>:: + type + hash_append(HashAlgorithm& h, const std::array& a); + +template +typename std::enable_if::value>::type + hash_append(HashAlgorithm& h, + const std::basic_string& s); + +template +typename std::enable_if::value>::type + hash_append(HashAlgorithm& h, + const std::basic_string& s); + +template +void hash_append(HashAlgorithm& h, const T1& first, const T2& second, + const Ts&... 
ts); + +// begin implementations for hash_append + +template +typename std::enable_if::value>::type + hash_append(HashAlgorithm& h, T(&a)[N]) +{ + for (const auto& t : a) + hash_append(h, t); +} + +template +typename std::enable_if>::value>::type + hash_append(HashAlgorithm& h, const std::pair& p) +{ + hash_append(h, p.first, p.second); +} + +namespace detail +{ +// @see +// http://stackoverflow.com/questions/7858817/unpacking-a-tuple-to-call-a-matching-function-pointer +template +struct sequence; + +template +struct generate : generate +{ + // nothing +}; + +template +struct generate<0, S...> +{ + using type = sequence; +}; + +template +void hash_tuple(HashAlgorithm& h, const std::tuple& t, sequence) +{ + hash_append(h, std::get(t)...); +} +} + +template +typename std::enable_if>::value>:: + type + hash_append(HashAlgorithm& h, const std::tuple& t) +{ + detail::hash_tuple(h, t, typename detail::generate::type{}); +} + +template +typename std::enable_if>::value>:: + type + hash_append(HashAlgorithm& h, const std::array& a) +{ + for (const auto& t : a) + hash_append(h, a); +} + +template +typename std::enable_if::value>::type + hash_append(HashAlgorithm& h, + const std::basic_string& s) +{ + h(s.data(), s.size() * sizeof(Char)); + hash_append(h, s.size()); +} + +template +typename std::enable_if::value>::type + hash_append(HashAlgorithm& h, + const std::basic_string& s) +{ + for (const auto& c : s) + hash_append(h, c); + hash_append(h, s.size()); +} + +template +void hash_append(HashAlgorithm& h, const T1& first, const T2& second, + const Ts&... ts) +{ + hash_append(h, first); + hash_append(h, second, ts...); +} + +namespace detail +{ +inline uint64_t get_process_seed() +{ + static uint64_t seed = std::random_device{}(); + return seed; +} +} + +/** + * A generic, randomly seeded hash function. 
+ * @see + * http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n3980.html#seeding + */ +template > +struct hash +{ + using result_type = typename HashAlgorithm::result_type; + + template + result_type operator()(const T& t) const + { + auto seed = detail::get_process_seed(); + HashAlgorithm h(seed); + using util::hash_append; + hash_append(h, t); + return static_cast(h); + } }; } } diff --git a/include/util/string_view.h b/include/util/string_view.h index 0150674e2..51530a5c3 100644 --- a/include/util/string_view.h +++ b/include/util/string_view.h @@ -10,6 +10,8 @@ #ifndef META_UTIL_STRING_VIEW_H_ #define META_UTIL_STRING_VIEW_H_ +#include "util/hash.h" + #if META_HAS_EXPERIMENTAL_STRING_VIEW #include namespace meta @@ -31,8 +33,6 @@ using wstring_view = basic_string_view; #include #include -#include "util/hash.h" - namespace meta { namespace util @@ -627,24 +627,34 @@ std::basic_ostream& namespace std { - template struct hash> + : public meta::util::hash<> { -#if META_HAS_NONEMPTY_HASH_SUPPORT - meta::util::murmur_hash<> hasher; -#endif - - size_t operator()( - const meta::util::basic_string_view& view) const noexcept - { -#ifndef META_HAS_NONEMPTY_HASH_SUPPORT - meta::util::murmur_hash<> hasher{97562527}; -#endif - return hasher(reinterpret_cast(view.data()), - view.size()); - } }; } #endif // !META_HAS_EXPERIMENTAL_STRING_VIEW + +namespace meta +{ +namespace util +{ +template +typename std::enable_if::value>::type + hash_append(HashAlgorithm& h, const basic_string_view& s) +{ + h(s.data(), s.size() * sizeof(Char)); + hash_append(h, s.size()); +} + +template +typename std::enable_if::value>::type + hash_append(HashAlgorithm& h, const basic_string_view& s) +{ + for (const auto& c : s) + hash_append(h, c); + hash_append(h, s.size()); +} +} +} #endif // META_UTIL_STRING_VIEW_H_ From 77b5579ae76a1ded9ee203cccd2a48924f248c10 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 8 Sep 2015 02:01:05 -0500 Subject: [PATCH 261/481] Update porter2_stemmer version. --- deps/porter2_stemmer | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/porter2_stemmer b/deps/porter2_stemmer index 06891b3bf..d98a0332d 160000 --- a/deps/porter2_stemmer +++ b/deps/porter2_stemmer @@ -1 +1 @@ -Subproject commit 06891b3bf8daf7cb8ca56954aff646a86f20724b +Subproject commit d98a0332d2bc28e8d38620658d0f7b58677983cd From f451e147dbe9e9a790c661975edfc05f3ed24e30 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 8 Sep 2015 02:04:30 -0500 Subject: [PATCH 262/481] Bump porter2_stemmer version. 
--- deps/porter2_stemmer | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/porter2_stemmer b/deps/porter2_stemmer index a9718c892..d98a0332d 160000 --- a/deps/porter2_stemmer +++ b/deps/porter2_stemmer @@ -1 +1 @@ -Subproject commit a9718c892a935baff774dfb450f21e864a14c311 +Subproject commit d98a0332d2bc28e8d38620658d0f7b58677983cd From 4c13d2ec26de53dc9c1695d377f3c6757e2deb02 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Tue, 8 Sep 2015 20:12:14 -0500 Subject: [PATCH 263/481] use new hash_combine and use vector instead of deque in lm::sentence (better == performance) --- include/lm/diff.h | 3 ++- include/lm/sentence.h | 50 +++++++++-------------------------- include/lm/static_probe_map.h | 14 ++++------ src/lm/sentence.cpp | 12 +++------ src/lm/static_probe_map.cpp | 8 +++--- 5 files changed, 28 insertions(+), 59 deletions(-) diff --git a/include/lm/diff.h b/include/lm/diff.h index 346290fa2..4534ab2c0 100644 --- a/include/lm/diff.h +++ b/include/lm/diff.h @@ -14,6 +14,7 @@ #include #include "cpptoml.h" #include "lm/language_model.h" +#include "util/hash.h" namespace meta { @@ -186,7 +187,7 @@ class diff /// Keeps track of sentences that have already been generated so we don't /// perform redundant calculations - std::unordered_set seen_; + std::unordered_set> seen_; /// How many candidate sentences to store when calling diff::candidates uint64_t max_cand_size_; diff --git a/include/lm/sentence.h b/include/lm/sentence.h index 9629ea756..4fb231014 100644 --- a/include/lm/sentence.h +++ b/include/lm/sentence.h @@ -27,9 +27,9 @@ namespace lm class sentence { public: - using iterator = std::deque::iterator; - using const_iterator = std::deque::const_iterator; - using size_type = std::deque::size_type; + using iterator = std::vector::iterator; + using const_iterator = std::vector::const_iterator; + using size_type = std::vector::size_type; /** * Default constructor; an empty sentence. @@ -105,7 +105,7 @@ class sentence /** * @return the sequence of tokens that comprise this sentence */ - const std::deque& tokens() const; + const std::vector& tokens() const; /** * @return the token at the front of the sentence @@ -139,12 +139,6 @@ class sentence */ void pop_back(); - /** - * Emplaces a token at the beginning of the sentence - */ - template - void emplace_front(Args&&... 
args); - /** * Emplaces a token at the end of the sentence */ @@ -178,7 +172,7 @@ class sentence private: /// The tokens (words) in the sentence - std::deque tokens_; + std::vector tokens_; /// String representations of the sequence of edit oeprations performed std::vector ops_; @@ -201,34 +195,16 @@ inline bool operator!=(const sentence& lhs, const sentence& rhs) { return !(lhs == rhs); } -} -} -namespace std -{ -template <> -struct hash +template +void hash_append(HashAlgorithm& h, const sentence& s) { -#if META_HAS_NONEMPTY_HASH_SUPPORT - meta::util::murmur_hash<> hasher; -#endif - - size_t operator()(const meta::lm::sentence& sent) const noexcept - { -#ifndef META_HAS_NONEMPTY_HASH_SUPPORT - meta::util::murmur_hash<> hasher{89122527}; -#endif - // create a vector of hashes of all the tokens in the sentence - std::vector hashed; - for (const auto& word : sent) - hashed.push_back(hasher( - reinterpret_cast(word.data()), word.size())); - - // hash the hashes as sequences of uint8_ts - return hasher(reinterpret_cast(hashed.data()), - hashed.size() * sizeof(std::size_t)); - } -}; + using util::hash_append; + for (const auto& word : s) + hash_append(h, word); + hash_append(h, s.size()); +} +} } #endif diff --git a/include/lm/static_probe_map.h b/include/lm/static_probe_map.h index 24a1c7045..2a812b264 100644 --- a/include/lm/static_probe_map.h +++ b/include/lm/static_probe_map.h @@ -14,7 +14,6 @@ #include "lm/lm_node.h" #include "util/disk_vector.h" #include "util/optional.h" -#include "util/hash.h" namespace meta { @@ -63,19 +62,16 @@ class static_probe_map void insert(const std::string& key, float prob, float backoff); private: + /** + * Helper function to create hasher and hash str + */ + uint64_t hash(const std::string& str) const; + /// A seed for the string hash function static constexpr uint64_t seed_ = 0x2bedf99b3aa222d9; /// The internal map representing std::string -> lm_node pairs util::disk_vector table_; - - /// 64-bit hash function for strings - util::murmur_hash<> hash_; - - /** - * Helper function to hash a string with util::murmur_hash - */ - uint64_t hash(const std::string& str) const; }; /** diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp index 68426ac8d..94a8616f6 100644 --- a/src/lm/sentence.cpp +++ b/src/lm/sentence.cpp @@ -110,7 +110,7 @@ const std::vector& sentence::operations() const return ops_; } -const std::deque& sentence::tokens() const +const std::vector& sentence::tokens() const { return tokens_; } @@ -127,12 +127,12 @@ const std::string& sentence::back() const void sentence::push_front(const std::string& token) { - tokens_.push_front(token); + tokens_.insert(tokens_.begin(), token); } void sentence::pop_front() { - tokens_.pop_front(); + tokens_.erase(tokens_.begin()); } void sentence::push_back(const std::string& token) @@ -145,12 +145,6 @@ void sentence::pop_back() tokens_.pop_back(); } -template -void sentence::emplace_front(Args&&... args) -{ - tokens_.emplace_front(std::forward(args)...); -} - template void sentence::emplace_back(Args&&... 
args) { diff --git a/src/lm/static_probe_map.cpp b/src/lm/static_probe_map.cpp index 222e8d1a0..920603086 100644 --- a/src/lm/static_probe_map.cpp +++ b/src/lm/static_probe_map.cpp @@ -5,6 +5,7 @@ #include #include "lm/static_probe_map.h" +#include "util/hash.h" namespace meta { @@ -12,8 +13,7 @@ namespace lm { static_probe_map::static_probe_map(const std::string& filename, uint64_t num_elems) - : table_{filename, static_cast((num_elems / 0.7) * 2)}, - hash_{seed_} + : table_{filename, static_cast((num_elems / 0.7) * 2)} // load factor of 0.7; x2 for keys and vals { } @@ -64,7 +64,9 @@ util::optional static_probe_map::find(const std::string& key) const uint64_t static_probe_map::hash(const std::string& str) const { - return hash_(reinterpret_cast(str.c_str()), str.size()); + util::murmur_hash<> hasher{seed_}; + hasher(str.data(), str.length()); + return static_cast(hasher); } } } From fa3ae9580d5b00aad5208e4ff868b79cda5b3f34 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Wed, 9 Sep 2015 10:30:35 -0500 Subject: [PATCH 264/481] explanatory comments for the choice of std::vector in lm::sentence --- src/lm/sentence.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/lm/sentence.cpp b/src/lm/sentence.cpp index 94a8616f6..d98124586 100644 --- a/src/lm/sentence.cpp +++ b/src/lm/sentence.cpp @@ -127,11 +127,17 @@ const std::string& sentence::back() const void sentence::push_front(const std::string& token) { + // we use a std::vector instead of a std::deque because sentence's + // operator== (using std::vector) is significantly faster; push_front is not + // called as much as operator== tokens_.insert(tokens_.begin(), token); } void sentence::pop_front() { + // we use a std::vector instead of a std::deque because sentence's + // operator== (using std::vector) is significantly faster; pop_front is not + // called as much as operator== tokens_.erase(tokens_.begin()); } From f671b763bc22a2db40d386df6a685882deab28c7 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Wed, 9 Sep 2015 10:49:09 -0500 Subject: [PATCH 265/481] move all lm_node logic into the struct itself, and remove any potential endianness errors re: #107 --- include/lm/lm_node.h | 36 +++++++++++++++++++++++++++++++++--- src/lm/static_probe_map.cpp | 8 +------- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/include/lm/lm_node.h b/include/lm/lm_node.h index 311e7e788..7ca2f5992 100644 --- a/include/lm/lm_node.h +++ b/include/lm/lm_node.h @@ -23,26 +23,56 @@ namespace lm */ struct lm_node { + /** + * Default constructor. + */ lm_node() : prob{0.0f}, backoff{0.0f} { } + /** + * Parameter constructor. 
+ * @param p The probability value + * @param b The backoff value + */ lm_node(float p, float b) : prob{p}, backoff{b} { } + /** + * Constructor that takes a packed [prob][backoff] uint64_t to construct + * this node + * @param packed + */ lm_node(uint64_t packed) { - uint32_t buf = packed >> 32; - std::memcpy(&prob, &packed, sizeof(float)); - std::memcpy(&backoff, &buf, sizeof(float)); + char* buf = reinterpret_cast(&packed); + std::memcpy(&prob, buf, sizeof(float)); + std::memcpy(&backoff, buf + sizeof(float), sizeof(float)); } + /** + * Equality operator defined so lm_node can be used in a dictionary + */ bool operator==(const lm_node& other) const { return prob == other.prob && backoff == other.backoff; } + /** + * @param p The probability value + * @param b The backoff value + * @return a packed uint64_t containing [prob][backoff] + */ + static uint64_t write_packed(float p, float b) + { + uint64_t packed; + char* buf = reinterpret_cast(&packed); + std::memcpy(buf, &p, sizeof(float)); + std::memcpy(buf + sizeof(float), &b, sizeof(float)); + return packed; + } + float prob; float backoff; }; diff --git a/src/lm/static_probe_map.cpp b/src/lm/static_probe_map.cpp index 920603086..ddd536f1e 100644 --- a/src/lm/static_probe_map.cpp +++ b/src/lm/static_probe_map.cpp @@ -3,7 +3,6 @@ * @author Sean Massung */ -#include #include "lm/static_probe_map.h" #include "util/hash.h" @@ -28,12 +27,7 @@ void static_probe_map::insert(const std::string& key, float prob, float backoff) if (table_[idx] == uint64_t{0}) { table_[idx] = hashed; - - // pack prob and float into uint64_t slot next to key val - uint64_t buf = 0; - std::memcpy(&table_[idx + 1], &prob, sizeof(float)); - std::memcpy(&buf, &backoff, sizeof(float)); - table_[idx + 1] |= (buf << 32); + table_[idx + 1] = lm_node::write_packed(prob, backoff); return; } From d309415e3388df0f0e7bfc0916d08059ffb567ce Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Wed, 9 Sep 2015 11:06:10 -0500 Subject: [PATCH 266/481] make feature_selector::init private and friend make_selector re: #107 --- include/features/feature_selector.h | 25 ++++++++++++++++--------- src/features/selector_factory.cpp | 2 +- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/include/features/feature_selector.h b/include/features/feature_selector.h index da799aac8..5646ee908 100644 --- a/include/features/feature_selector.h +++ b/include/features/feature_selector.h @@ -14,8 +14,9 @@ #include #include -#include "util/disk_vector.h" +#include "cpptoml.h" #include "index/forward_index.h" +#include "util/disk_vector.h" namespace meta { @@ -47,14 +48,6 @@ class feature_selector feature_selector(const std::string& prefix, std::shared_ptr idx); - /** - * Creates the state of this feature_selector if necessary; this logic is - * outside the constructor since it requires pure virtual functions - * implemented by deriving classes. - * @param features_per_class - */ - void init(uint64_t features_per_class); - /** * Default destructor. */ @@ -154,6 +147,20 @@ class feature_selector * scores, implemented as derived classes. */ + /** + * Creates the state of this feature_selector if necessary; this logic is + * outside the constructor since it requires pure virtual functions + * implemented by deriving classes. 
+ * @param features_per_class + */ + void init(uint64_t features_per_class); + + /// friend the factory function used to create feature_selectors, since + /// they need to call the init + friend std::unique_ptr + make_selector(const cpptoml::table& config, + std::shared_ptr idx); + /** * Calculates the probabilities of terms and classes given the current * index. diff --git a/src/features/selector_factory.cpp b/src/features/selector_factory.cpp index a308ea954..6edf78841 100644 --- a/src/features/selector_factory.cpp +++ b/src/features/selector_factory.cpp @@ -52,7 +52,7 @@ std::unique_ptr auto selector = selector_factory::get().create(*method, *table, std::move(idx)); - selector->init(features_per_class); + selector->init(features_per_class); // make_selector is a friend return selector; } } From e24cc832d261650de90ae5d22a2fe02af3dfd06e Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 11 Sep 2015 20:31:01 -0500 Subject: [PATCH 267/481] Fix issue with potentially empty chunks in direct forward index creation. --- src/index/forward_index.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index 603017f8e..7e81580bc 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -392,7 +392,13 @@ void forward_index::impl::merge_chunks(size_t num_chunks, std::vector chunks; chunks.reserve(num_chunks); for (size_t i = 0; i < num_chunks; ++i) - chunks.emplace_back(idx_->index_name() + "/chunk-" + std::to_string(i)); + { + auto filename = idx_->index_name() + "/chunk-" + std::to_string(i); + if (filesystem::file_exists(filename) + && filesystem::file_size(filename) > 0) + chunks.emplace_back(idx_->index_name() + "/chunk-" + + std::to_string(i)); + } printing::progress progress{ " > Merging postings: ", From 9b995a1a58302df09be724b173054207bee1c268 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 11 Sep 2015 20:31:30 -0500 Subject: [PATCH 268/481] Remove interfaces that directly parsed configuration files. Instead, take a cpptoml::table directly and move the parsing of config to client code. This should enable nicer configuration generation in applications that e.g. don't have an explicit configuration file. 
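For example, a test or embedding application can now assemble its
settings in memory instead of writing a TOML file to disk first. A
sketch of the intended usage, assuming cpptoml's in-memory table API
(the key names here are illustrative; a real index still needs the
usual corpus and analyzer configuration):

    #include "cpptoml.h"
    #include "index/make_index.h"

    auto config = cpptoml::make_table();
    config->insert("prefix", std::string{"my-index"});
    config->insert("dataset", std::string{"ceeaus"});
    config->insert("corpus-type", std::string{"line-corpus"});
    // no configuration file on disk required any more:
    auto idx = index::make_index<index::inverted_index>(*config);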
--- deps/cpptoml | 2 +- include/index/cached_index.h | 5 +- include/index/cached_index.tcc | 7 +- include/index/eval/ir_eval.h | 4 +- include/index/forward_index.h | 6 +- include/index/inverted_index.h | 8 +- include/index/make_index.h | 42 ++++--- include/test/forward_index_test.h | 6 +- include/test/inverted_index_test.h | 4 +- src/analyzers/tools/tokenize_test.cpp | 4 +- src/classify/tools/classify.cpp | 6 +- src/classify/tools/online-classify.cpp | 8 +- src/corpus/corpus.cpp | 16 +-- src/corpus/tools/corpus-gen.cpp | 6 +- src/features/tools/feature_summary.cpp | 6 +- src/index/eval/ir_eval.cpp | 3 +- src/index/forward_index.cpp | 16 +-- src/index/inverted_index.cpp | 12 +- src/index/tools/forward-to-libsvm.cpp | 3 +- src/index/tools/index.cpp | 3 +- src/index/tools/interactive-search.cpp | 10 +- src/index/tools/query-runner.cpp | 10 +- src/index/tools/search.cpp | 8 +- src/lm/tools/diff_test.cpp | 3 +- src/parser/tools/parser_test.cpp | 4 +- src/parser/tools/parser_train.cpp | 4 +- src/parser/tools/sr_parse.cpp | 4 +- src/sequence/crf/tools/crf-test.cpp | 4 +- src/sequence/crf/tools/crf-train.cpp | 4 +- src/sequence/crf/tools/pos_tag.cpp | 2 +- src/sequence/crf/tools/pos_tokenizer.cpp | 4 +- src/sequence/tools/greedy_tagger_test.cpp | 4 +- src/sequence/tools/greedy_tagger_train.cpp | 4 +- src/test/analyzer_test.cpp | 5 +- src/test/classifier_test.cpp | 20 ++-- src/test/features_test.cpp | 19 ++- src/test/forward_index_test.cpp | 129 ++++++++++----------- src/test/inverted_index_test.cpp | 97 ++++++++-------- src/test/ir_eval_test.cpp | 11 +- src/test/lm_test.cpp | 4 +- src/test/ranker_test.cpp | 47 ++++---- src/tools/profile.cpp | 16 +-- src/tools/top_k.cpp | 6 +- src/topics/tools/lda-topics.cpp | 7 +- src/topics/tools/lda.cpp | 6 +- src/topics/tools/topic_corpus.cpp | 6 +- 46 files changed, 300 insertions(+), 305 deletions(-) diff --git a/deps/cpptoml b/deps/cpptoml index 70b977247..caf04b833 160000 --- a/deps/cpptoml +++ b/deps/cpptoml @@ -1 +1 @@ -Subproject commit 70b977247d97a5d0766c1d3f91a0ad0427970cf9 +Subproject commit caf04b8336c91b0fa2752a8d03b48ed6d72570f8 diff --git a/include/index/cached_index.h b/include/index/cached_index.h index 815e9cc6d..3dec4adc8 100644 --- a/include/index/cached_index.h +++ b/include/index/cached_index.h @@ -30,9 +30,6 @@ template class Cache> class cached_index : public Index { public: - /// inherit the constructors - using Index::Index; - /** * Forwarding constructor: construct the Index part using the * config, but then forward the additional arguments to the @@ -44,7 +41,7 @@ class cached_index : public Index * constructor */ template - cached_index(cpptoml::table& config, Args&&... args); + cached_index(const cpptoml::table& config, Args&&... args); using primary_key_type = typename Index::primary_key_type; using secondary_key_type = typename Index::secondary_key_type; diff --git a/include/index/cached_index.tcc b/include/index/cached_index.tcc index 31373e7d5..ba5526ad2 100644 --- a/include/index/cached_index.tcc +++ b/include/index/cached_index.tcc @@ -12,15 +12,16 @@ namespace index template class Cache> template -cached_index::cached_index(cpptoml::table& config, Args&&... args) +cached_index::cached_index(const cpptoml::table& config, + Args&&... args) : Index{config}, cache_(std::forward(args)...) 
{ /* nothing */ } template class Cache> -auto cached_index::search_primary( - primary_key_type p_id) const -> std::shared_ptr +auto cached_index::search_primary(primary_key_type p_id) const + -> std::shared_ptr { auto opt = cache_.find(p_id); if (opt) diff --git a/include/index/eval/ir_eval.h b/include/index/eval/ir_eval.h index 71f98946c..19e9d048f 100644 --- a/include/index/eval/ir_eval.h +++ b/include/index/eval/ir_eval.h @@ -34,9 +34,9 @@ class ir_eval using result_type = std::vector; /** - * @param config_file Path to cpptoml configuration file + * @param config Configuration group */ - ir_eval(const std::string& config_file); + ir_eval(const cpptoml::table& config); /** * @param results The ranked list of results diff --git a/include/index/forward_index.h b/include/index/forward_index.h index 4d1b33340..40fc2ef23 100644 --- a/include/index/forward_index.h +++ b/include/index/forward_index.h @@ -60,7 +60,7 @@ class forward_index : public disk_index * it. */ template - friend std::shared_ptr make_index(const std::string& config_file, + friend std::shared_ptr make_index(const cpptoml::table& config, Args&&... args); /** @@ -69,7 +69,7 @@ class forward_index : public disk_index */ template class Cache, class... Args> friend std::shared_ptr> - make_index(const std::string& config_file, Args&&... args); + make_index(const cpptoml::table& config_file, Args&&... args); using primary_key_type = doc_id; using secondary_key_type = term_id; @@ -148,7 +148,7 @@ class forward_index : public disk_index * This function initializes the forward index. * @param config_file The configuration file used to create the index */ - void create_index(const std::string& config_file); + void create_index(const cpptoml::table& config); /** * @return whether this index contains all necessary files diff --git a/include/index/inverted_index.h b/include/index/inverted_index.h index 3ad99c920..738d827aa 100644 --- a/include/index/inverted_index.h +++ b/include/index/inverted_index.h @@ -76,7 +76,7 @@ class inverted_index : public disk_index * it. */ template - friend std::shared_ptr make_index(const std::string&, Args&&...); + friend std::shared_ptr make_index(const cpptoml::table&, Args&&...); /** * inverted_index is a friend of the factory method used to create @@ -84,7 +84,7 @@ class inverted_index : public disk_index */ template class Cache, class... Args> friend std::shared_ptr> - make_index(const std::string& config_file, Args&&... args); + make_index(const cpptoml::table& config, Args&&... args); protected: /** @@ -172,9 +172,9 @@ class inverted_index : public disk_index /** * This function initializes the disk index; it is called by the * make_index factory function. 
- * @param config_file The configuration to be used + * @param config The configuration to be used */ - void create_index(const std::string& config_file); + void create_index(const cpptoml::table& config); /** * This function loads a disk index from its filesystem diff --git a/include/index/make_index.h b/include/index/make_index.h index 4d17aa564..863ae1926 100644 --- a/include/index/make_index.h +++ b/include/index/make_index.h @@ -25,19 +25,19 @@ class inverted_index; class forward_index; /// Inverted index using default DBLRU cache -using dblru_inverted_index = - cached_index; +using dblru_inverted_index + = cached_index; /// Inverted index using splay cache using splay_inverted_index = cached_index; /// In-memory forward index -using memory_forward_index = - cached_index; +using memory_forward_index + = cached_index; /// Forward index using default DBLRU cache -using dblru_forward_index = - cached_index; +using dblru_forward_index + = cached_index; /// Forward index using splay cache using splay_forward_index = cached_index; @@ -47,21 +47,17 @@ using splay_forward_index = cached_index; * Usage: * * ~~~cpp - * auto idx = index::make_index(config_path); + * auto idx = index::make_index(config); * ~~~ * - * @param config_file The path to the configuration file to be - * used to build the index + * @param config The configuration to be used to build the index * @param args any additional arguments to forward to the * constructor for the chosen index type (usually none) * @return A properly initialized index */ template -std::shared_ptr make_index(const std::string& config_file, - Args&&... args) +std::shared_ptr make_index(const cpptoml::table& config, Args&&... args) { - auto config = cpptoml::parse_file(config_file); - // check if we have paths specified for either kind of index if (!(config.contains("forward-index") && config.contains("inverted-index"))) @@ -80,15 +76,23 @@ std::shared_ptr make_index(const std::string& config_file, "forward and inverted index names must be different!"}; } - // can't use std::make_shared here since the Index constructor is private - auto idx = - std::shared_ptr{new Index(config, std::forward(args)...)}; + // below is needed so that make_shared can find a public ctor to invoke + struct make_shared_enabler : public Index + { + make_shared_enabler(const cpptoml::table& config, Args&&... args) + : Index(config, std::forward(args)...) + { + // nothing + } + }; + auto idx = std::make_shared( + config, std::forward(args)...); // if index has already been made, load it if (filesystem::make_directory(idx->index_name()) && idx->valid()) idx->load_index(); else - idx->create_index(config_file); + idx->create_index(config); return idx; } @@ -114,9 +118,9 @@ std::shared_ptr make_index(const std::string& config_file, */ template class Cache, class... Args> std::shared_ptr> - make_index(const std::string& config_file, Args&&... args) + make_index(const cpptoml::table& config, Args&&... args) { - return make_index>(config_file, + return make_index>(config, std::forward(args)...); } } diff --git a/include/test/forward_index_test.h b/include/test/forward_index_test.h index 7d4273c90..85f40276b 100644 --- a/include/test/forward_index_test.h +++ b/include/test/forward_index_test.h @@ -25,7 +25,7 @@ namespace testing /** * Creates a test-config.toml with the desired settings. */ -void create_libsvm_config(); +std::shared_ptr create_libsvm_config(); /** * Asserts that the bcancer corpus was created correctly. 
@@ -58,12 +58,12 @@ void check_ceeaus_doc_id(Index& idx); /** * Runs the ceeaus forward index tests. */ -void ceeaus_forward_test(); +void ceeaus_forward_test(const cpptoml::table& conf); /** * Runs the bcancer forward index tests. */ -void bcancer_forward_test(); +void bcancer_forward_test(const cpptoml::table& conf); /** * Runs all the forward_index tests. diff --git a/include/test/inverted_index_test.h b/include/test/inverted_index_test.h index 3db0f9f09..39a3f11d1 100644 --- a/include/test/inverted_index_test.h +++ b/include/test/inverted_index_test.h @@ -23,10 +23,10 @@ namespace meta namespace testing { /** - * Creates test-config.toml with the desired settings. + * Creates a configuration file with the desired settings. * @param corpus_type line or file corpus */ -void create_config(const std::string& corpus_type); +std::shared_ptr create_config(const std::string& corpus_type); /** * Checks that ceeaus index was built correctly. diff --git a/src/analyzers/tools/tokenize_test.cpp b/src/analyzers/tools/tokenize_test.cpp index 63d5d2851..839de7a66 100644 --- a/src/analyzers/tools/tokenize_test.cpp +++ b/src/analyzers/tools/tokenize_test.cpp @@ -24,7 +24,7 @@ int main(int argc, char** argv) std::unique_ptr stream; - auto analyzers = config.get_table_array("analyzers"); + auto analyzers = config->get_table_array("analyzers"); for (const auto& group : analyzers->get()) { auto method = group->get_as("method"); @@ -33,7 +33,7 @@ int main(int argc, char** argv) if (*method == analyzers::ngram_word_analyzer::id) { - stream = analyzers::load_filters(config, *group); + stream = analyzers::load_filters(*config, *group); break; } } diff --git a/src/classify/tools/classify.cpp b/src/classify/tools/classify.cpp index cedeb8d51..81c1c4e7a 100644 --- a/src/classify/tools/classify.cpp +++ b/src/classify/tools/classify.cpp @@ -77,14 +77,14 @@ int main(int argc, char* argv[]) sequence::register_analyzers(); auto config = cpptoml::parse_file(argv[1]); - auto class_config = config.get_table("classifier"); + auto class_config = config->get_table("classifier"); if (!class_config) { cerr << "Missing classifier configuration group in " << argv[1] << endl; return 1; } - auto f_idx = index::make_index(argv[1]); + auto f_idx = index::make_index(*config); auto docs = f_idx->docs(); printing::progress progress{" > Pre-fetching for cache: ", docs.size()}; @@ -100,7 +100,7 @@ int main(int argc, char* argv[]) auto classifier_method = *class_config->get_as("method"); if (classifier_method == "knn" || classifier_method == "nearest-centroid") { - auto i_idx = index::make_index(argv[1]); + auto i_idx = index::make_index(*config); classifier = classify::make_classifier(*class_config, f_idx, i_idx); } else diff --git a/src/classify/tools/online-classify.cpp b/src/classify/tools/online-classify.cpp index 51e13c2a3..e209d37b2 100644 --- a/src/classify/tools/online-classify.cpp +++ b/src/classify/tools/online-classify.cpp @@ -27,7 +27,7 @@ int main(int argc, char* argv[]) sequence::register_analyzers(); auto config = cpptoml::parse_file(argv[1]); - auto class_config = config.get_table("classifier"); + auto class_config = config->get_table("classifier"); if (!class_config) { std::cerr << "Missing classifier configuration group in " << argv[1] @@ -35,21 +35,21 @@ int main(int argc, char* argv[]) return 1; } - auto batch_size = config.get_as("batch-size"); + auto batch_size = config->get_as("batch-size"); if (!batch_size) { std::cerr << "Missing batch-size in " << argv[1] << std::endl; return 1; } - auto test_start = 
config.get_as("test-start"); + auto test_start = config->get_as("test-start"); if (!test_start) { std::cerr << "Missing test-start in " << argv[1] << std::endl; return 1; } - auto f_idx = index::make_index(argv[1]); + auto f_idx = index::make_index(*config); if (static_cast(*test_start) > f_idx->num_docs()) { diff --git a/src/corpus/corpus.cpp b/src/corpus/corpus.cpp index da969fa1f..6f7c54f7b 100644 --- a/src/corpus/corpus.cpp +++ b/src/corpus/corpus.cpp @@ -39,12 +39,6 @@ void corpus::set_metadata_parser(metadata_parser&& parser) mdata_parser_ = std::move(parser); } -std::unique_ptr corpus::load(const std::string& config_file) -{ - auto config = cpptoml::parse_file(config_file); - return load(config); -} - std::unique_ptr corpus::load(const cpptoml::table& config) { auto corp = config.get_as("corpus"); @@ -65,17 +59,17 @@ std::unique_ptr corpus::load(const cpptoml::table& config) + ") not present"}; auto corpus_config = cpptoml::parse_file(corpus_filename); - auto type = corpus_config.get_as("type"); + auto type = corpus_config->get_as("type"); if (!type) throw corpus_exception{"type missing from corpus configuration file"}; auto encoding - = corpus_config.get_as("encoding").value_or("utf-8"); + = corpus_config->get_as("encoding").value_or("utf-8"); std::unique_ptr result; if (*type == "file-corpus") { - auto file_list = corpus_config.get_as("list"); + auto file_list = corpus_config->get_as("list"); if (!file_list) throw corpus_exception{ "list missing from corpus configuration file"}; @@ -89,7 +83,7 @@ std::unique_ptr corpus::load(const cpptoml::table& config) { std::string filename = *prefix + "/" + *dataset + "/" + *dataset + ".dat"; - auto lines = corpus_config.get_as("num-lines"); + auto lines = corpus_config->get_as("num-lines"); if (!lines) result = make_unique(filename, encoding); else @@ -108,7 +102,7 @@ std::unique_ptr corpus::load(const cpptoml::table& config) throw corpus_exception{"corpus type was not able to be determined"}; result->set_metadata_parser({*prefix + "/" + *dataset + "/metadata.dat", - metadata_schema(corpus_config)}); + metadata_schema(*corpus_config)}); return result; } } diff --git a/src/corpus/tools/corpus-gen.cpp b/src/corpus/tools/corpus-gen.cpp index 5a8f02c11..6f84f0582 100644 --- a/src/corpus/tools/corpus-gen.cpp +++ b/src/corpus/tools/corpus-gen.cpp @@ -62,15 +62,15 @@ int main(int argc, char* argv[]) } auto config = cpptoml::parse_file(argv[1]); - auto prefix = config.get_as("prefix"); + auto prefix = config->get_as("prefix"); if (!prefix) throw std::runtime_error{"prefix missing from configuration file"}; - auto dataset = config.get_as("dataset"); + auto dataset = config->get_as("dataset"); if (!dataset) throw std::runtime_error{"dataset missing from configuration file"}; - auto file_list = config.get_as("list"); + auto file_list = config->get_as("list"); if (!file_list) throw std::runtime_error{"list missing from configuration file"}; diff --git a/src/features/tools/feature_summary.cpp b/src/features/tools/feature_summary.cpp index c2912cfe2..c53850d75 100644 --- a/src/features/tools/feature_summary.cpp +++ b/src/features/tools/feature_summary.cpp @@ -27,15 +27,15 @@ int main(int argc, char* argv[]) logging::set_cerr_logging(); auto config = cpptoml::parse_file(argv[1]); - auto feature_config = config.get_table("features"); + auto feature_config = config->get_table("features"); if (!feature_config) { std::cerr << "Missing [features] config table" << std::endl; return 1; } - auto f_idx = index::make_index(argv[1]); - auto selector = 
features::make_selector(config, f_idx); + auto f_idx = index::make_index(*config); + auto selector = features::make_selector(*config, f_idx); selector->select(100); selector->print_summary(10); } diff --git a/src/index/eval/ir_eval.cpp b/src/index/eval/ir_eval.cpp index 9bf400826..bdb4107b9 100644 --- a/src/index/eval/ir_eval.cpp +++ b/src/index/eval/ir_eval.cpp @@ -19,9 +19,8 @@ namespace meta namespace index { -ir_eval::ir_eval(const std::string& config_file) +ir_eval::ir_eval(const cpptoml::table& config) { - auto config = cpptoml::parse_file(config_file); auto path = config.get_as("query-judgements"); if (!path) throw ir_eval_exception{"query judgement file was not specified"}; diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index 7e81580bc..0e69b4599 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -176,7 +176,7 @@ void forward_index::load_index() impl_->load_labels(); auto config = cpptoml::parse_file(index_name() + "/config.toml"); - if (!fwd_impl_->is_libsvm_format(config)) + if (!fwd_impl_->is_libsvm_format(*config)) impl_->load_term_id_mapping(); impl_->load_label_id_mapping(); @@ -186,10 +186,12 @@ void forward_index::load_index() unique_terms_file >> fwd_impl_->total_unique_terms_; } -void forward_index::create_index(const std::string& config_file) +void forward_index::create_index(const cpptoml::table& config) { - filesystem::copy_file(config_file, index_name() + "/config.toml"); - auto config = cpptoml::parse_file(index_name() + "/config.toml"); + { + std::ofstream config_file{index_name() + "/config.toml"}; + config_file << config; + } // if the corpus is a single libsvm formatted file, then we are done; // otherwise, we will create an inverted index and the uninvert it @@ -212,9 +214,9 @@ void forward_index::create_index(const std::string& config_file) << ENDLG; { // Ensure all files are flushed before uninverting - make_index(config_file); + make_index(config); } - auto inv_idx = make_index(config_file); + auto inv_idx = make_index(config); fwd_impl_->create_uninverted_metadata(inv_idx->index_name()); // RAM budget is given in MB @@ -226,7 +228,7 @@ void forward_index::create_index(const std::string& config_file) { LOG(info) << "Creating forward index: " << index_name() << ENDLG; - auto docs = corpus::corpus::load(config_file); + auto docs = corpus::corpus::load(config); { auto analyzer = analyzers::load(config); diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp index 980d76775..3e183c213 100644 --- a/src/index/inverted_index.cpp +++ b/src/index/inverted_index.cpp @@ -109,17 +109,19 @@ bool inverted_index::valid() const return true; } -void inverted_index::create_index(const std::string& config_file) +void inverted_index::create_index(const cpptoml::table& config) { // save the config file so we can recreate the analyzer - filesystem::copy_file(config_file, index_name() + "/config.toml"); + { + std::ofstream config_file{index_name() + "/config.toml"}; + config_file << config; + } LOG(info) << "Creating index: " << index_name() << ENDLG; // load the documents from the corpus - auto docs = corpus::corpus::load(config_file); + auto docs = corpus::corpus::load(config); - auto config = cpptoml::parse_file(config_file); auto ram_budget = static_cast( config.get_as("indexer-ram-budget").value_or(1024)); @@ -159,8 +161,6 @@ void inverted_index::load_index() { LOG(info) << "Loading index from disk: " << index_name() << ENDLG; - auto config = cpptoml::parse_file(index_name() + "/config.toml"); - 
impl_->initialize_metadata(); impl_->load_term_id_mapping(); impl_->load_label_id_mapping(); diff --git a/src/index/tools/forward-to-libsvm.cpp b/src/index/tools/forward-to-libsvm.cpp index 38551afe2..1c13a50f6 100644 --- a/src/index/tools/forward-to-libsvm.cpp +++ b/src/index/tools/forward-to-libsvm.cpp @@ -18,7 +18,8 @@ int main(int argc, char** argv) logging::set_cerr_logging(); - auto idx = index::make_index(argv[1]); + auto config = cpptoml::parse_file(argv[1]); + auto idx = index::make_index(*config); { std::ofstream output{argv[2]}; printing::progress progress{" > Converting to libsvm: ", diff --git a/src/index/tools/index.cpp b/src/index/tools/index.cpp index 91838b6be..740c2803f 100644 --- a/src/index/tools/index.cpp +++ b/src/index/tools/index.cpp @@ -37,7 +37,8 @@ int main(int argc, char* argv[]) { // Creates an inverted index with no cache. We don't need a cache here // since we're never searching the index, only building it. - auto idx = index::make_index(argv[1]); + auto config = cpptoml::parse_file(argv[1]); + auto idx = index::make_index(*config); // Print out some data about the corpus. std::cout << "Number of documents: " << idx->num_docs() << std::endl; diff --git a/src/index/tools/interactive-search.cpp b/src/index/tools/interactive-search.cpp index 8317cfd25..01042059f 100644 --- a/src/index/tools/interactive-search.cpp +++ b/src/index/tools/interactive-search.cpp @@ -50,18 +50,18 @@ int main(int argc, char* argv[]) sequence::register_analyzers(); // Create an inverted index based on the config file. - auto idx = index::make_index(argv[1]); + auto config = cpptoml::parse_file(argv[1]); + auto idx = index::make_index(*config); // Create a ranking class based on the config file. - auto config = cpptoml::parse_file(argv[1]); - auto group = config.get_table("ranker"); + auto group = config->get_table("ranker"); if (!group) throw std::runtime_error{"\"ranker\" group needed in config file!"}; auto ranker = index::make_ranker(*group); // Find the path prefix to each document so we can print out the contents. - std::string prefix = *config.get_as("prefix") + "/" - + *config.get_as("dataset") + "/"; + std::string prefix = *config->get_as("prefix") + "/" + + *config->get_as("dataset") + "/"; std::cout << "Enter a query, or blank to quit." << std::endl << std::endl; diff --git a/src/index/tools/query-runner.cpp b/src/index/tools/query-runner.cpp index 7b9ac5c2b..da23ccb47 100644 --- a/src/index/tools/query-runner.cpp +++ b/src/index/tools/query-runner.cpp @@ -38,17 +38,17 @@ int main(int argc, char* argv[]) sequence::register_analyzers(); // Create an inverted index based on the config file - auto idx = index::make_index(argv[1]); + auto config = cpptoml::parse_file(argv[1]); + auto idx = index::make_index(*config); // Create a ranking class based on the config file. 
- auto config = cpptoml::parse_file(argv[1]); - auto group = config.get_table("ranker"); + auto group = config->get_table("ranker"); if (!group) throw std::runtime_error{"\"ranker\" group needed in config file!"}; auto ranker = index::make_ranker(*group); // Get the path to the file containing queries - auto query_path = config.get_as("query-path"); + auto query_path = config->get_as("query-path"); if (!query_path) throw std::runtime_error{ "config file needs a \"query-path\" parameter"}; @@ -57,7 +57,7 @@ int main(int argc, char* argv[]) std::unique_ptr eval; try { - eval = make_unique(argv[1]); + eval = make_unique(*config); } catch (index::ir_eval::ir_eval_exception& ex) { diff --git a/src/index/tools/search.cpp b/src/index/tools/search.cpp index ae3bd93d1..cf2a8e8b0 100644 --- a/src/index/tools/search.cpp +++ b/src/index/tools/search.cpp @@ -39,18 +39,18 @@ int main(int argc, char* argv[]) sequence::register_analyzers(); // Create an inverted index based on the config file. - auto idx = index::make_index(argv[1]); - auto config = cpptoml::parse_file(argv[1]); + auto idx = index::make_index(*config); + // Create a ranking class based on the config file. - auto group = config.get_table("ranker"); + auto group = config->get_table("ranker"); if (!group) throw std::runtime_error{"\"ranker\" group needed in config file!"}; auto ranker = index::make_ranker(*group); // Use UTF-8 for the default encoding unless otherwise specified. - auto encoding = config.get_as("encoding").value_or("utf-8"); + auto encoding = config->get_as("encoding").value_or("utf-8"); // Time how long it takes to create the index. By default, common::time's // unit of measurement is milliseconds. diff --git a/src/lm/tools/diff_test.cpp b/src/lm/tools/diff_test.cpp index 63f7a6252..7b9d3dd8e 100644 --- a/src/lm/tools/diff_test.cpp +++ b/src/lm/tools/diff_test.cpp @@ -25,7 +25,8 @@ int main(int argc, char* argv[]) logging::set_cerr_logging(); - lm::diff correcter{cpptoml::parse_file(argv[1])}; + auto config = cpptoml::parse_file(argv[1]); + lm::diff correcter{*config}; std::ifstream in{argv[2]}; auto num_sentences = filesystem::num_lines(argv[2]); printing::progress prog{"Editing sentences ", num_sentences}; diff --git a/src/parser/tools/parser_test.cpp b/src/parser/tools/parser_test.cpp index 1e4afec96..97e3eb76f 100644 --- a/src/parser/tools/parser_test.cpp +++ b/src/parser/tools/parser_test.cpp @@ -38,14 +38,14 @@ int main(int argc, char** argv) auto config = cpptoml::parse_file(argv[1]); - auto prefix = config.get_as("prefix"); + auto prefix = config->get_as("prefix"); if (!prefix) { LOG(fatal) << "Global configuration must have a prefix key" << ENDLG; return 1; } - auto parser_grp = config.get_table("parser"); + auto parser_grp = config->get_table("parser"); if (!parser_grp) { LOG(fatal) << "Configuration must contain a [parser] group" << ENDLG; diff --git a/src/parser/tools/parser_train.cpp b/src/parser/tools/parser_train.cpp index 043d5a650..abbf83e9e 100644 --- a/src/parser/tools/parser_train.cpp +++ b/src/parser/tools/parser_train.cpp @@ -34,14 +34,14 @@ int main(int argc, char** argv) auto config = cpptoml::parse_file(argv[1]); - auto prefix = config.get_as("prefix"); + auto prefix = config->get_as("prefix"); if (!prefix) { LOG(fatal) << "Global configuration must have a prefix key" << ENDLG; return 1; } - auto parser_grp = config.get_table("parser"); + auto parser_grp = config->get_table("parser"); if (!parser_grp) { LOG(fatal) << "Configuration must contain a [parser] group" << ENDLG; diff --git 
a/src/parser/tools/sr_parse.cpp b/src/parser/tools/sr_parse.cpp index f02937a62..a0201c0de 100644 --- a/src/parser/tools/sr_parse.cpp +++ b/src/parser/tools/sr_parse.cpp @@ -25,7 +25,7 @@ int main(int argc, char* argv[]) logging::set_cerr_logging(); auto config = cpptoml::parse_file(argv[1]); - auto seq_grp = config.get_table("sequence"); + auto seq_grp = config->get_table("sequence"); if (!seq_grp) throw std::runtime_error{"[sequence] group needed in config file"}; @@ -33,7 +33,7 @@ int main(int argc, char* argv[]) if (!prefix) throw std::runtime_error{"[sequence] group needs a prefix key"}; - auto parser_grp = config.get_table("parser"); + auto parser_grp = config->get_table("parser"); if (!parser_grp) throw std::runtime_error{"[parser] group needed in config file"}; diff --git a/src/sequence/crf/tools/crf-test.cpp b/src/sequence/crf/tools/crf-test.cpp index afab23aa7..cfface956 100644 --- a/src/sequence/crf/tools/crf-test.cpp +++ b/src/sequence/crf/tools/crf-test.cpp @@ -27,14 +27,14 @@ int main(int argc, char** argv) auto config = cpptoml::parse_file(argv[1]); - auto prefix = config.get_as("prefix"); + auto prefix = config->get_as("prefix"); if (!prefix) { LOG(fatal) << "Global configuration must have a prefix key" << ENDLG; return 1; } - auto crf_grp = config.get_table("crf"); + auto crf_grp = config->get_table("crf"); if (!crf_grp) { LOG(fatal) << "Configuration must contain a [crf] group" << ENDLG; diff --git a/src/sequence/crf/tools/crf-train.cpp b/src/sequence/crf/tools/crf-train.cpp index a47f41ebf..17a398603 100644 --- a/src/sequence/crf/tools/crf-train.cpp +++ b/src/sequence/crf/tools/crf-train.cpp @@ -30,14 +30,14 @@ int main(int argc, char** argv) auto config = cpptoml::parse_file(argv[1]); - auto prefix = config.get_as("prefix"); + auto prefix = config->get_as("prefix"); if (!prefix) { LOG(fatal) << "Global configuration must have a prefix key" << ENDLG; return 1; } - auto crf_grp = config.get_table("crf"); + auto crf_grp = config->get_table("crf"); if (!crf_grp) { LOG(fatal) << "Configuration must contain a [crf] group" << ENDLG; diff --git a/src/sequence/crf/tools/pos_tag.cpp b/src/sequence/crf/tools/pos_tag.cpp index 945ffd27d..24acb0123 100644 --- a/src/sequence/crf/tools/pos_tag.cpp +++ b/src/sequence/crf/tools/pos_tag.cpp @@ -25,7 +25,7 @@ int main(int argc, char* argv[]) logging::set_cerr_logging(); auto config = cpptoml::parse_file(argv[1]); - auto crf_group = config.get_table("crf"); + auto crf_group = config->get_table("crf"); if (!crf_group) { std::cerr << "[crf] group needed in config file" << std::endl; diff --git a/src/sequence/crf/tools/pos_tokenizer.cpp b/src/sequence/crf/tools/pos_tokenizer.cpp index 232e3ae62..e1d9d3758 100644 --- a/src/sequence/crf/tools/pos_tokenizer.cpp +++ b/src/sequence/crf/tools/pos_tokenizer.cpp @@ -27,14 +27,14 @@ int main(int argc, char* argv[]) auto config = cpptoml::parse_file(argv[1]); - auto keep_list_filename = config.get_as("function-words"); + auto keep_list_filename = config->get_as("function-words"); std::unordered_set keep_list; std::ifstream keep_list_file{*keep_list_filename}; std::string word; while (keep_list_file >> word) keep_list.insert(word); - auto crf_group = config.get_table("crf"); + auto crf_group = config->get_table("crf"); if (!crf_group) { std::cerr << "[crf] group needed in config file" << std::endl; diff --git a/src/sequence/tools/greedy_tagger_test.cpp b/src/sequence/tools/greedy_tagger_test.cpp index 91b89c08f..26ef45ed7 100644 --- a/src/sequence/tools/greedy_tagger_test.cpp +++ 
b/src/sequence/tools/greedy_tagger_test.cpp @@ -34,14 +34,14 @@ int main(int argc, char** argv) auto config = cpptoml::parse_file(argv[1]); - auto prefix = config.get_as("prefix"); + auto prefix = config->get_as("prefix"); if (!prefix) { LOG(fatal) << "Global configuration must have a prefix key" << ENDLG; return 1; } - auto seq_grp = config.get_table("sequence"); + auto seq_grp = config->get_table("sequence"); if (!seq_grp) { LOG(fatal) << "Configuration must contain a [sequence] group" << ENDLG; diff --git a/src/sequence/tools/greedy_tagger_train.cpp b/src/sequence/tools/greedy_tagger_train.cpp index 1a8fed7a6..24d22bc94 100644 --- a/src/sequence/tools/greedy_tagger_train.cpp +++ b/src/sequence/tools/greedy_tagger_train.cpp @@ -33,14 +33,14 @@ int main(int argc, char** argv) auto config = cpptoml::parse_file(argv[1]); - auto prefix = config.get_as("prefix"); + auto prefix = config->get_as("prefix"); if (!prefix) { LOG(fatal) << "Global configuration must have a prefix key" << ENDLG; return 1; } - auto seq_grp = config.get_table("sequence"); + auto seq_grp = config->get_table("sequence"); if (!seq_grp) { LOG(fatal) << "Configuration must contain a [sequence] group" << ENDLG; diff --git a/src/test/analyzer_test.cpp b/src/test/analyzer_test.cpp index d94f70d7a..e36a20ed3 100644 --- a/src/test/analyzer_test.cpp +++ b/src/test/analyzer_test.cpp @@ -19,9 +19,8 @@ namespace std::unique_ptr make_filter() { using namespace analyzers; - create_config("line"); - auto config = cpptoml::parse_file("test-config.toml"); - return analyzers::default_filter_chain(config); + auto line_cfg = create_config("line"); + return analyzers::default_filter_chain(*line_cfg); } } diff --git a/src/test/classifier_test.cpp b/src/test/classifier_test.cpp index d5b6babee..bee8f2ae6 100644 --- a/src/test/classifier_test.cpp +++ b/src/test/classifier_test.cpp @@ -51,11 +51,11 @@ int run_tests(const std::string& type) // to delete their directory; this is needed for weirdness on NFS or // other filesystems that might lock opened files { - auto i_idx - = index::make_index("test-config.toml"); + auto cfg = create_config(type); + auto i_idx = index::make_index(*cfg); auto f_idx = index::make_index( - "test-config.toml"); + *cfg); num_failed += testing::run_test("naive-bayes-cv-" + type, [&]() { @@ -168,8 +168,7 @@ int run_tests(const std::string& type) num_failed += testing::run_test( "svm-wrapper-" + type, [&]() { - auto config = cpptoml::parse_file("test-config.toml"); - auto mod_path = config.get_as("libsvm-modules"); + auto mod_path = cfg->get_as("libsvm-modules"); if (!mod_path) throw std::runtime_error{"no path for libsvm-modules"}; svm_wrapper svm{f_idx, *mod_path}; @@ -190,11 +189,11 @@ int run_load_save_tests() // to delete their directory; this is needed for weirdness on NFS or // other filesystems that might lock opened files { - auto i_idx - = index::make_index("test-config.toml"); + auto line_cfg = create_config("line"); + auto i_idx = index::make_index(*line_cfg); auto f_idx = index::make_index( - "test-config.toml"); + *line_cfg); num_failed += testing::run_test( "naive-bayes-save-load", [&]() @@ -215,8 +214,7 @@ int run_load_save_tests() num_failed += testing::run_test( "svm-wrapper-save-load", [&]() { - auto config = cpptoml::parse_file("test-config.toml"); - auto mod_path = config.get_as("libsvm-modules"); + auto mod_path = line_cfg->get_as("libsvm-modules"); if (!mod_path) throw std::runtime_error{"no path for libsvm-modules"}; { @@ -243,9 +241,7 @@ int classifier_tests() { int num_failed = 0; system("rm -rf 
ceeaus-*"); - create_config("file"); num_failed += run_tests("file"); - create_config("line"); num_failed += run_tests("line"); num_failed += run_load_save_tests(); return num_failed; diff --git a/src/test/features_test.cpp b/src/test/features_test.cpp index e30510af7..865da8fd1 100644 --- a/src/test/features_test.cpp +++ b/src/test/features_test.cpp @@ -19,12 +19,13 @@ namespace template void test_construction(Index& idx, const std::string& id) { - std::ofstream fconfig{"feature-config.toml"}; - fconfig << "[features]\nmethod = \"" << id << "\"\n" - << "prefix = \"test-features\""; - fconfig.close(); - auto config = cpptoml::parse_file("feature-config.toml"); - auto selector = features::make_selector(config, idx); + auto config = cpptoml::make_table(); + auto fcfg = cpptoml::make_table(); + fcfg->insert("method", id); + fcfg->insert("prefix", "test-features"); + config->insert("features", fcfg); + + auto selector = features::make_selector(*config, idx); selector->select(20); selector->select(50); selector->select_percent(0.05); @@ -44,12 +45,11 @@ void test_construction(Index& idx, const std::string& id) int features_tests() { int failed = 0; - create_config("line"); + auto line_cfg = create_config("line"); // scope for forward index object { - auto f_idx = index::make_index( - "test-config.toml"); + auto f_idx = index::make_index(*line_cfg); failed += testing::run_test("chi-square", [&]() { @@ -70,7 +70,6 @@ int features_tests() } system("rm -rf ceeaus-* test-features.*"); - filesystem::delete_file("feature-config.toml"); return failed; } } diff --git a/src/test/forward_index_test.cpp b/src/test/forward_index_test.cpp index 146c28d03..651185ff0 100644 --- a/src/test/forward_index_test.cpp +++ b/src/test/forward_index_test.cpp @@ -10,19 +10,24 @@ namespace meta namespace testing { -void create_libsvm_config() +std::shared_ptr create_libsvm_config() { auto orig_config = cpptoml::parse_file("config.toml"); - std::string config_filename{"test-config.toml"}; - std::ofstream config_file{config_filename}; - config_file << "prefix = \"" << *orig_config.get_as("prefix") - << "\"\n" - << "corpus-type = \"line-corpus\"\n" - << "dataset = \"breast-cancer\"\n" - << "forward-index = \"bcancer-fwd\"\n" - << "inverted-index = \"bcancer-inv\"\n" - << "[[analyzers]]\n" - << "method = \"libsvm\"\n"; + + auto config = cpptoml::make_table(); + config->insert("prefix", *orig_config->get_as("prefix")); + config->insert("corpus-type", "line-corpus"); + config->insert("dataset", "breast-cancer"); + config->insert("forward-index", "bcancer-fwd"); + config->insert("inverted-index", "bcancer-inv"); + + auto anas = cpptoml::make_table_array(); + auto ana = cpptoml::make_table(); + ana->insert("method", "libsvm"); + anas->push_back(ana); + config->insert("analyzers", anas); + + return config; } template @@ -92,89 +97,81 @@ void check_ceeaus_doc_id(Index& idx) { in >> first; in >> second; + std::cout << "Checking term " << first << " (" << idx.term_text(first) + << ")" << std::endl; ASSERT_EQUAL(first, count.first); ASSERT_APPROX_EQUAL(second, count.second); } } -void ceeaus_forward_test() +void ceeaus_forward_test(const cpptoml::table& conf) { auto idx = index::make_index( - "test-config.toml", uint32_t{10000}); + conf, uint32_t{10000}); check_ceeaus_expected_fwd(*idx); check_ceeaus_doc_id(*idx); } -void bcancer_forward_test() +void bcancer_forward_test(const cpptoml::table& conf) { auto idx = index::make_index( - "test-config.toml", uint32_t{10000}); + conf, uint32_t{10000}); check_bcancer_expected(*idx); 
check_bcancer_doc_id(*idx); } int forward_index_tests() { - create_config("file"); + auto file_cfg = create_config("file"); int num_failed = 0; - num_failed += testing::run_test("forward-index-build-file-corpus", [&]() - { - system("rm -rf ceeaus-*"); - ceeaus_forward_test(); - }); - - num_failed += testing::run_test("forward-index-read-file-corpus", [&]() - { - ceeaus_forward_test(); - }); + // num_failed += testing::run_test("forward-index-build-file-corpus", [&]() + // { + // system("rm -rf ceeaus-*"); + // ceeaus_forward_test(*file_cfg); + // }); - num_failed += testing::run_test("forward-index-build-uninvert", [&]() - { - system("rm -rf ceeaus-*"); + // num_failed += testing::run_test("forward-index-read-file-corpus", [&]() + // { + // ceeaus_forward_test(*file_cfg); + // }); - // hack to inject "uninvert = true" at the top of the config file - auto cfg_contents = filesystem::file_text("test-config.toml"); - cfg_contents = "uninvert = true\n" + cfg_contents; - filesystem::delete_file("test-config.toml"); - { - std::ofstream file{"test-config.toml"}; - file.write(cfg_contents.c_str(), cfg_contents.size()); - } + // num_failed += testing::run_test("forward-index-build-uninvert", [&]() + // { + // system("rm -rf ceeaus-*"); - ceeaus_forward_test(); - }); + // file_cfg->insert("uninvert", true); + // ceeaus_forward_test(*file_cfg); + // }); - filesystem::delete_file("test-config.toml"); - create_config("line"); + auto line_cfg = create_config("line"); num_failed += testing::run_test("forward-index-build-line-corpus", [&]() - { - system("rm -rf ceeaus-*"); - ceeaus_forward_test(); - }); - - num_failed += testing::run_test("forward-index-read-line-corpus", [&]() - { - ceeaus_forward_test(); - system("rm -rf ceeaus-* test-config.toml"); - }); - - - create_libsvm_config(); - - num_failed += testing::run_test("forward-index-build-libsvm", [&]() - { - system("rm -rf bcancer-*"); - bcancer_forward_test(); - }); - - num_failed += testing::run_test("forward-index-load-libsvm", [&]() - { - bcancer_forward_test(); - system("rm -rf bcancer-* test-config.toml"); - }); + { + system("rm -rf ceeaus-*"); + ceeaus_forward_test(*line_cfg); + }); + + // num_failed += testing::run_test("forward-index-read-line-corpus", [&]() + // { + // ceeaus_forward_test(*line_cfg); + // system("rm -rf ceeaus-*"); + // }); + + // auto svm_cfg = create_libsvm_config(); + + // num_failed += testing::run_test("forward-index-build-libsvm", [&]() + // { + // system("rm -rf bcancer-*"); + // bcancer_forward_test(*svm_cfg); + // }); + + // num_failed += testing::run_test("forward-index-load-libsvm", [&]() + // { + // bcancer_forward_test(*svm_cfg); + // system("rm -rf bcancer-*"); + // }); return num_failed; } diff --git a/src/test/inverted_index_test.cpp b/src/test/inverted_index_test.cpp index 28f7476d1..0cca06fe3 100644 --- a/src/test/inverted_index_test.cpp +++ b/src/test/inverted_index_test.cpp @@ -10,52 +10,61 @@ namespace meta namespace testing { -void create_config(const std::string& corpus_type) +std::shared_ptr create_config(const std::string& corpus_type) { auto orig_config = cpptoml::parse_file("config.toml"); - std::string config_filename{"test-config.toml"}; - std::ofstream config_file{config_filename}; + std::ofstream config_file{"test-config.toml"}; - auto stop_words = orig_config.get_as("stop-words"); + auto stop_words = orig_config->get_as("stop-words"); if (!stop_words) throw std::runtime_error{"\"stop-words\" not in config"}; - auto libsvm_modules = orig_config.get_as("libsvm-modules"); + auto libsvm_modules = 
orig_config->get_as("libsvm-modules"); if (!libsvm_modules) throw std::runtime_error{"\"libsvm-modules\" not in config"}; - auto punctuation = orig_config.get_as("punctuation"); + auto punctuation = orig_config->get_as("punctuation"); if (!punctuation) throw std::runtime_error{"\"punctuation\" not in config"}; - auto start_exeptions = orig_config.get_as("start-exceptions"); + auto start_exeptions = orig_config->get_as("start-exceptions"); if (!start_exeptions) throw std::runtime_error{"\"start-exceptions\" not in config"}; - auto end_exceptions = orig_config.get_as("end-exceptions"); + auto end_exceptions = orig_config->get_as("end-exceptions"); if (!end_exceptions) throw std::runtime_error{"\"end-exceptions\" not in config"}; - config_file << "stop-words = \"" << *stop_words << "\"\n" - << "punctuation = \"" << *punctuation << "\"\n" - << "start-exceptions = \"" << *start_exeptions << "\"\n" - << "end-exceptions = \"" << *end_exceptions << "\"\n" - << "prefix = \"" << *orig_config.get_as("prefix") - << "\"\n" - << "query-judgements = \"../data/ceeaus-qrels.txt\"\n" - << "libsvm-modules = \"" << *libsvm_modules << "\"\n" - << "dataset = \"ceeaus\"\n" - << "corpus = \"" << corpus_type << ".toml\"\n" - << "encoding = \"shift_jis\"\n" - << "forward-index = \"ceeaus-fwd\"\n" - << "inverted-index = \"ceeaus-inv\"\n" - << "[[analyzers]]\n" - << "method = \"ngram-word\"\n" - << "ngram = 1\n" - << "filter = \"default-chain\"\n" - << "[language-model]\n" - << "arpa-file = \"../data/english-sentences.arpa\"\n" - << "binary-file-prefix = \"test-lm-\""; + auto table = cpptoml::make_table(); + table->insert("stop-words", *stop_words); + table->insert("punctuation", *punctuation); + table->insert("start-exceptions", *start_exeptions); + table->insert("end-exceptions", *end_exceptions); + table->insert("prefix", *orig_config->get_as("prefix")); + table->insert("query-judgements", "../data/ceeaus-qrels.txt"); + table->insert("libsvm-modules", *libsvm_modules); + table->insert("dataset", "ceeaus"); + table->insert("corpus", corpus_type + ".toml"); + table->insert("encoding", "shift_jis"); + table->insert("forward-index", "ceeaus-fwd"); + table->insert("inverted-index", "ceeaus-inv"); + + auto anas = cpptoml::make_table_array(); + auto ana = cpptoml::make_table(); + ana->insert("method", "ngram-word"); + ana->insert("ngram", 1); + ana->insert("filter", "default-chain"); + anas->push_back(ana); + + table->insert("analyzers", anas); + + auto lm = cpptoml::make_table(); + lm->insert("arpa-file", "../data/english-sentences.arpa"); + lm->insert("binary-file-prefix", "test-lm-"); + + table->insert("language-model", lm); + + return table; } template @@ -102,15 +111,14 @@ void check_term_id(Index& idx) int inverted_index_tests() { - create_config("file"); + auto file_cfg = create_config("file"); int num_failed = 0; num_failed += testing::run_test( "inverted-index-build-file-corpus", [&]() { system("rm -rf ceeaus-inv"); - auto idx - = index::make_index("test-config.toml"); + auto idx = index::make_index(*file_cfg); check_ceeaus_expected(*idx); }); @@ -118,22 +126,20 @@ int inverted_index_tests() "inverted-index-read-file-corpus", [&]() { { - auto idx = index::make_index( - "test-config.toml"); + auto idx = index::make_index(*file_cfg); check_ceeaus_expected(*idx); check_term_id(*idx); } system("rm -rf ceeaus-inv test-config.toml"); }); - create_config("line"); + auto line_cfg = create_config("line"); system("rm -rf ceeaus-inv"); num_failed += testing::run_test( "inverted-index-build-line-corpus", [&]() { - auto idx 
- = index::make_index("test-config.toml"); + auto idx = index::make_index(*line_cfg); check_ceeaus_expected(*idx); }); @@ -141,30 +147,28 @@ int inverted_index_tests() "inverted-index-read-line-corpus", [&]() { auto idx = index::make_index( - "test-config.toml", uint32_t{10000}); + caching::splay_cache>(*line_cfg, + uint32_t{10000}); check_ceeaus_expected(*idx); check_term_id(*idx); check_term_id(*idx); // twice to check splay_caching }); #if META_HAS_ZLIB - create_config("gz"); + auto gz_cfg = create_config("gz"); system("rm -rf ceeaus-inv"); num_failed += testing::run_test( "inverted-index-build-gz-corpus", [&]() { - auto idx - = index::make_index("test-config.toml"); + auto idx = index::make_index(*gz_cfg); check_ceeaus_expected(*idx); }); num_failed += testing::run_test( "inverted-index-read-gz-corpus", [&]() { - auto idx - = index::make_index("test-config.toml"); + auto idx = index::make_index(*gz_cfg); check_ceeaus_expected(*idx); check_term_id(*idx); }); @@ -177,7 +181,7 @@ int inverted_index_tests() { auto idx = index::make_index( - "test-config.toml", uint64_t{1000}); + *line_cfg, uint64_t{1000}); check_term_id(*idx); check_term_id(*idx); }); @@ -186,8 +190,7 @@ int inverted_index_tests() "inverted-index-no-evict-cache", [&]() { auto idx = index::make_index( - "test-config.toml"); + caching::no_evict_cache>(*line_cfg); check_term_id(*idx); check_term_id(*idx); }); @@ -197,7 +200,7 @@ int inverted_index_tests() { auto idx = index::make_index( - "test-config.toml", uint8_t{8}); + *line_cfg, uint8_t{8}); check_term_id(*idx); check_term_id(*idx); }); diff --git a/src/test/ir_eval_test.cpp b/src/test/ir_eval_test.cpp index 61f489fc3..33483f9b6 100644 --- a/src/test/ir_eval_test.cpp +++ b/src/test/ir_eval_test.cpp @@ -36,11 +36,11 @@ int ir_eval_bounds() "ir-eval-bounds", [&]() { system("rm -rf ceeaus-inv"); - create_config("file"); + auto file_cfg = create_config("file"); auto idx - = index::make_index("test-config.toml"); + = index::make_index(*file_cfg); index::okapi_bm25 ranker; - index::ir_eval eval{"test-config.toml"}; + index::ir_eval eval{*file_cfg}; // sanity test bounds for (size_t i = 0; i < 5; ++i) { @@ -74,8 +74,8 @@ int ir_eval_results() return testing::run_test( "ir-eval-results", [&]() { - create_config("file"); - index::ir_eval eval{"test-config.toml"}; + auto file_cfg = create_config("file"); + index::ir_eval eval{*file_cfg}; ASSERT_APPROX_EQUAL(eval.map(), 0.0); ASSERT_APPROX_EQUAL(eval.gmap(), 0.0); @@ -136,7 +136,6 @@ int ir_eval_results() check_query(eval, results, qid, 1.0, 1.0, 1.0, 1.0, 1.0); // recall is still not perfect @5 check_query(eval, results, qid, 1.0 / 1.5, 1.0, 0.5, 1.0, 1.0, 5); - system("rm test-config.toml"); }); } diff --git a/src/test/lm_test.cpp b/src/test/lm_test.cpp index 7b27c03d8..ca0b56cb7 100644 --- a/src/test/lm_test.cpp +++ b/src/test/lm_test.cpp @@ -15,11 +15,11 @@ namespace testing int lm_tests() { int num_failed = 0; - create_config("line"); + auto line_cfg = create_config("line"); auto test = [&]() { - lm::language_model model{cpptoml::parse_file("test-config.toml")}; + lm::language_model model{*line_cfg}; lm::sentence s1{ " I disagree with this statement for several reasons . 
", false}; diff --git a/src/test/ranker_test.cpp b/src/test/ranker_test.cpp index 89fa221c4..13a5c4a5b 100644 --- a/src/test/ranker_test.cpp +++ b/src/test/ranker_test.cpp @@ -37,45 +37,44 @@ void test_rank(Ranker& r, Index& idx, const std::string& encoding) int ranker_tests() { - create_config("file"); + auto config = create_config("file"); system("rm -rf ceeaus-inv"); - auto idx = index::make_index("test-config.toml"); + auto idx = index::make_index(*config); - auto config = cpptoml::parse_file("test-config.toml"); std::string encoding = "utf-8"; - if (auto enc = config.get_as("encoding")) + if (auto enc = config->get_as("encoding")) encoding = *enc; int num_failed = 0; num_failed += testing::run_test("ranker-absolute-discount", [&]() - { - index::absolute_discount r; - test_rank(r, *idx, encoding); - }); + { + index::absolute_discount r; + test_rank(r, *idx, encoding); + }); num_failed += testing::run_test("ranker-dirichlet-prior", [&]() - { - index::dirichlet_prior r; - test_rank(r, *idx, encoding); - }); + { + index::dirichlet_prior r; + test_rank(r, *idx, encoding); + }); num_failed += testing::run_test("ranker-jelinek-mercer", [&]() - { - index::jelinek_mercer r; - test_rank(r, *idx, encoding); - }); + { + index::jelinek_mercer r; + test_rank(r, *idx, encoding); + }); num_failed += testing::run_test("ranker-okapi-bm25", [&]() - { - index::okapi_bm25 r; - test_rank(r, *idx, encoding); - }); + { + index::okapi_bm25 r; + test_rank(r, *idx, encoding); + }); num_failed += testing::run_test("ranker-pivoted-length", [&]() - { - index::pivoted_length r; - test_rank(r, *idx, encoding); - }); + { + index::pivoted_length r; + test_rank(r, *idx, encoding); + }); idx = nullptr; diff --git a/src/tools/profile.cpp b/src/tools/profile.cpp index e5017243a..9262271e3 100644 --- a/src/tools/profile.cpp +++ b/src/tools/profile.cpp @@ -313,19 +313,19 @@ int main(int argc, char* argv[]) bool all = args.find("--all") != args.end(); if (all || args.find("--stem") != args.end()) - stem(file, config); + stem(file, *config); if (all || args.find("--stop") != args.end()) - stop(file, config); + stop(file, *config); if (all || args.find("--pos") != args.end()) - pos(file, config, false); + pos(file, *config, false); if (all || args.find("--pos-replace") != args.end()) - pos(file, config, true); + pos(file, *config, true); if (all || args.find("--parse") != args.end()) - parse(file, config); + parse(file, *config); if (all || args.find("--freq-unigram") != args.end()) - freq(file, config, 1); + freq(file, *config, 1); if (all || args.find("--freq-bigram") != args.end()) - freq(file, config, 2); + freq(file, *config, 2); if (all || args.find("--freq-trigram") != args.end()) - freq(file, config, 3); + freq(file, *config, 3); } diff --git a/src/tools/top_k.cpp b/src/tools/top_k.cpp index 39b2c3971..fc19e0589 100644 --- a/src/tools/top_k.cpp +++ b/src/tools/top_k.cpp @@ -34,15 +34,15 @@ int main(int argc, char* argv[]) logging::set_cerr_logging(); auto config = cpptoml::parse_file(argv[1]); - auto group = config.get_table_array("analyzers"); + auto group = config->get_table_array("analyzers"); if (!group) throw std::runtime_error{"[[analyzers]] missing from config"}; // only use the feature representation of the first analyzer - auto filts = analyzers::load_filters(config, *(group->get()[0])); + auto filts = analyzers::load_filters(*config, *(group->get()[0])); std::unordered_map counts; - auto docs = corpus::corpus::load(config); + auto docs = corpus::corpus::load(*config); printing::progress prog{" > Reading 
corpus: ", docs->size()}; while (docs->has_next()) { diff --git a/src/topics/tools/lda-topics.cpp b/src/topics/tools/lda-topics.cpp index 183ab1204..a494ed571 100644 --- a/src/topics/tools/lda-topics.cpp +++ b/src/topics/tools/lda-topics.cpp @@ -21,8 +21,9 @@ int print_usage(const std::string& name) int print_topics(const std::string& config_file, const std::string& filename, size_t num_words) { + auto config = cpptoml::parse_file(config_file); auto idx = index::make_index( - config_file); + *config); std::ifstream file{filename}; while (file) @@ -39,7 +40,9 @@ int print_topics(const std::string& config_file, const std::string& filename, std::vector> pairs; auto comp = [](const std::pair& first, const std::pair& second) - { return first.second > second.second; }; + { + return first.second > second.second; + }; while (stream) { std::string to_split; diff --git a/src/topics/tools/lda.cpp b/src/topics/tools/lda.cpp index 3389de78b..dc90ece62 100644 --- a/src/topics/tools/lda.cpp +++ b/src/topics/tools/lda.cpp @@ -42,14 +42,14 @@ int run_lda(const std::string& config_file) using namespace meta::topics; auto config = cpptoml::parse_file(config_file); - if (!config.contains("lda")) + if (!config->contains("lda")) { std::cerr << "Missing lda configuration group in " << config_file << std::endl; return 1; } - auto lda_group = config.get_table("lda"); + auto lda_group = config->get_table("lda"); if (!check_parameter(config_file, *lda_group, "alpha") || !check_parameter(config_file, *lda_group, "beta") @@ -68,7 +68,7 @@ int run_lda(const std::string& config_file) auto f_idx = index::make_index( - config_file); + *config); if (type == "gibbs") { std::cout << "Beginning LDA using serial Gibbs sampling..." diff --git a/src/topics/tools/topic_corpus.cpp b/src/topics/tools/topic_corpus.cpp index b485e209e..e13c8d96b 100644 --- a/src/topics/tools/topic_corpus.cpp +++ b/src/topics/tools/topic_corpus.cpp @@ -122,7 +122,7 @@ int main(int argc, char* argv[]) auto config = cpptoml::parse_file(argv[1]); - auto ctype = *config.get_as("corpus-type"); + auto ctype = *config->get_as("corpus-type"); if (ctype != "line-corpus") { std::cerr << "Currently only line_corpus format is supported!" @@ -130,8 +130,8 @@ int main(int argc, char* argv[]) return 1; } - auto prefix = *config.get_as("prefix"); - auto dataset = *config.get_as("dataset"); + auto prefix = *config->get_as("prefix"); + auto dataset = *config->get_as("dataset"); std::ifstream thetas{argv[2]}; create_topic_corpus(prefix, dataset, thetas); } From 6e5ad8f87808797448016a124266727860c9e975 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 11 Sep 2015 20:33:20 -0500 Subject: [PATCH 269/481] Don't recompute the filename for chunk emplacement. 
--- src/index/forward_index.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index 0e69b4599..fc213ae27 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -398,8 +398,7 @@ void forward_index::impl::merge_chunks(size_t num_chunks, auto filename = idx_->index_name() + "/chunk-" + std::to_string(i); if (filesystem::file_exists(filename) && filesystem::file_size(filename) > 0) - chunks.emplace_back(idx_->index_name() + "/chunk-" - + std::to_string(i)); + chunks.emplace_back(filename); } printing::progress progress{ From 87f64cb7b5de5102f086ceae17b789b1624d19e1 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 12 Sep 2015 09:54:48 -0500 Subject: [PATCH 270/481] move all exception classes outside their respective class --- include/analyzers/token_stream.h | 15 ++++++------ include/caching/splay_cache.h | 15 ++++++------ include/classify/classifier/knn.h | 15 ++++++------ include/classify/classifier/naive_bayes.h | 12 +++++----- .../classify/classifier/nearest_centroid.h | 15 ++++++------ include/corpus/corpus.h | 18 +++++++-------- include/corpus/metadata.h | 15 +++++++----- include/features/feature_selector.h | 1 + include/features/selector_factory.h | 13 +++++++++-- include/index/eval/ir_eval.h | 21 +++++++++-------- include/index/forward_index.h | 17 +++++++------- include/index/inverted_index.h | 17 +++++++------- include/index/postings_inverter.h | 16 ++++++------- include/index/vocabulary_map_writer.h | 16 ++++++------- include/io/compressed_file_reader.h | 15 ++++++------ include/io/compressed_file_writer.h | 15 ++++++------ include/io/mmap_file.h | 15 ++++++------ include/lm/diff.h | 3 +++ include/lm/sentence.h | 3 +++ include/parser/sr_parser.h | 18 +++++++-------- include/parser/transition_map.h | 19 ++++++++------- include/parser/trees/visitors/binarizer.h | 13 +++++++---- .../parser/trees/visitors/transition_finder.h | 15 +++++++----- include/sequence/observation.h | 18 +++++++-------- include/util/disk_vector.h | 17 +++++++------- include/util/factory.h | 15 ++++++------ include/util/invertible_map.h | 23 +++++++++---------- include/util/persistent_stack.h | 12 +++++----- include/util/persistent_stack.tcc | 4 ++-- src/analyzers/filters/filter_factory.cpp | 10 ++++---- src/analyzers/filters/icu_filter.cpp | 2 +- src/analyzers/filters/length_filter.cpp | 9 ++++---- src/analyzers/filters/list_filter.cpp | 5 ++-- src/analyzers/tokenizers/icu_tokenizer.cpp | 5 ++-- src/classify/classifier/naive_bayes.cpp | 8 ++++--- src/corpus/document.cpp | 3 +-- src/corpus/gz_corpus.cpp | 6 ++--- src/corpus/metadata.cpp | 8 +++---- src/corpus/metadata_parser.cpp | 2 +- src/features/selector_factory.cpp | 6 ++--- src/index/metadata_file.cpp | 4 ++-- src/index/metadata_writer.cpp | 2 +- src/index/tools/query-runner.cpp | 2 +- src/parser/sr_parser.cpp | 6 ++--- src/parser/state.cpp | 8 +++---- src/parser/transition_map.cpp | 9 ++++---- src/parser/trees/visitors/binarizer.cpp | 2 +- .../trees/visitors/transition_finder.cpp | 7 +++--- src/sequence/observation.cpp | 6 ++--- 49 files changed, 268 insertions(+), 253 deletions(-) diff --git a/include/analyzers/token_stream.h b/include/analyzers/token_stream.h index 5fca98d83..c352e0ab8 100644 --- a/include/analyzers/token_stream.h +++ b/include/analyzers/token_stream.h @@ -54,16 +54,15 @@ class token_stream * @return a unique_ptr to copy this object */ virtual std::unique_ptr clone() const = 0; - - /** - * Basic exception class for token stream 
interactions. - */ - class token_stream_exception : public std::runtime_error - { - using std::runtime_error::runtime_error; - }; }; +/** + * Basic exception class for token stream interactions. + */ +class token_stream_exception : public std::runtime_error +{ + using std::runtime_error::runtime_error; +}; } } #endif diff --git a/include/caching/splay_cache.h b/include/caching/splay_cache.h index 492c5f090..523051c32 100644 --- a/include/caching/splay_cache.h +++ b/include/caching/splay_cache.h @@ -169,16 +169,15 @@ class splay_cache * @param subroot */ void rotate_right(node*& subroot); +}; +/** + * Basic exception for splay_cache interactions. + */ +class splay_cache_exception : public std::runtime_error +{ public: - /** - * Basic exception for splay_cache interactions. - */ - class splay_cache_exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; + using std::runtime_error::runtime_error; }; } } diff --git a/include/classify/classifier/knn.h b/include/classify/classifier/knn.h index 0b2cb51e0..07c84eba6 100644 --- a/include/classify/classifier/knn.h +++ b/include/classify/classifier/knn.h @@ -88,16 +88,15 @@ class knn : public classifier /** Whether we want the neighbors to be weighted by distance or not */ const bool weighted_; +}; +/** + * Basic exception for knn interactions. + */ +class knn_exception : public std::runtime_error +{ public: - /** - * Basic exception for knn interactions. - */ - class knn_exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; + using std::runtime_error::runtime_error; }; /** diff --git a/include/classify/classifier/naive_bayes.h b/include/classify/classifier/naive_bayes.h index 6104a49aa..acd35d8ff 100644 --- a/include/classify/classifier/naive_bayes.h +++ b/include/classify/classifier/naive_bayes.h @@ -82,12 +82,6 @@ class naive_bayes : public classifier */ const static util::string_view id; - class exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; - private: /** * Contains P(term|class) for each class. @@ -100,6 +94,12 @@ class naive_bayes : public classifier stats::multinomial class_probs_; }; +class naive_bayes_exception : public std::runtime_error +{ + public: + using std::runtime_error::runtime_error; +}; + /** * Specialization of the factory method used for creating naive bayes * classifiers. diff --git a/include/classify/classifier/nearest_centroid.h b/include/classify/classifier/nearest_centroid.h index 67090fb24..13fdd269e 100644 --- a/include/classify/classifier/nearest_centroid.h +++ b/include/classify/classifier/nearest_centroid.h @@ -74,16 +74,15 @@ class nearest_centroid : public classifier /// The document centroids for this learner std::unordered_map> centroids_; +}; +/** + * Basic exception for nearest_centroid interactions. + */ +class nearest_centroid_exception : public std::runtime_error +{ public: - /** - * Basic exception for nearest_centroid interactions. - */ - class nearest_centroid_exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; + using std::runtime_error::runtime_error; }; /** diff --git a/include/corpus/corpus.h b/include/corpus/corpus.h index 614f39fb1..8b97579d2 100644 --- a/include/corpus/corpus.h +++ b/include/corpus/corpus.h @@ -87,15 +87,6 @@ class corpus static std::unique_ptr load(const std::string& config_file); static std::unique_ptr load(const cpptoml::table& config); - /** - * Basic exception for corpus interactions. 
- */ - class corpus_exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; - protected: /** * Helper function to be used by deriving classes in implementing @@ -111,6 +102,15 @@ class corpus /// The metadata parser util::optional mdata_parser_; }; + +/** + * Basic exception for corpus interactions. + */ +class corpus_exception : public std::runtime_error +{ + public: + using std::runtime_error::runtime_error; +}; } } diff --git a/include/corpus/metadata.h b/include/corpus/metadata.h index fbda1cfbb..23f923d9c 100644 --- a/include/corpus/metadata.h +++ b/include/corpus/metadata.h @@ -123,12 +123,6 @@ class metadata return util::nullopt; } - class exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; - /** * Tagged union to represent a single metadata field. */ @@ -320,6 +314,15 @@ class metadata * @return the corresponding metadata::schema object. */ metadata::schema metadata_schema(const cpptoml::table& config); + +/** + * Exception class for metadata operations. + */ +class metadata_exception : public std::runtime_error +{ + public: + using std::runtime_error::runtime_error; +}; } } #endif diff --git a/include/features/feature_selector.h b/include/features/feature_selector.h index 5646ee908..7a5bd41cb 100644 --- a/include/features/feature_selector.h +++ b/include/features/feature_selector.h @@ -190,6 +190,7 @@ class feature_selector /// P(c,t) indexed by [label_id][term_id] std::vector> co_occur_; }; + /** * Basic exception for feature selectors. */ diff --git a/include/features/selector_factory.h b/include/features/selector_factory.h index eaa6b7e02..dafd7486c 100644 --- a/include/features/selector_factory.h +++ b/include/features/selector_factory.h @@ -24,6 +24,15 @@ namespace meta namespace features { +/** + * Exception for selector_factory operations. + */ +class selector_factory_exception : public std::runtime_error +{ + public: + using std::runtime_error::runtime_error; +}; + /** * Factory that is responsible for creating selectors from configuration * files. Clients should use the register_selector method instead of this @@ -76,11 +85,11 @@ std::unique_ptr { auto prefix = config.get_as("prefix"); if (!prefix) - throw selector_factory::exception{"no prefix in [features] table"}; + throw selector_factory_exception{"no prefix in [features] table"}; auto method = config.get_as("method"); if (!method) - throw selector_factory::exception{ + throw selector_factory_exception{ "feature selection method required in [features] table"}; return make_unique(*prefix + "." + *method, std::move(idx)); diff --git a/include/index/eval/ir_eval.h b/include/index/eval/ir_eval.h index 19e9d048f..58ec56b4d 100644 --- a/include/index/eval/ir_eval.h +++ b/include/index/eval/ir_eval.h @@ -46,7 +46,8 @@ class ir_eval * retrieved~docs} \f$ */ double precision(const result_type& results, query_id q_id, - uint64_t num_docs = std::numeric_limits::max()) const; + uint64_t num_docs + = std::numeric_limits::max()) const; /** * @param results The ranked list of results @@ -56,7 +57,8 @@ class ir_eval * relevant~docs} \f$ */ double recall(const result_type& results, query_id q_id, - uint64_t num_docs = std::numeric_limits::max()) const; + uint64_t num_docs + = std::numeric_limits::max()) const; /** * @param results The ranked list of results @@ -141,16 +143,15 @@ class ir_eval */ double relevant_retrieved(const result_type& results, query_id q_id, uint64_t num_docs) const; +}; +/** + * Basic exception for ir_eval interactions. 
+ */ +class ir_eval_exception : public std::runtime_error +{ public: - /** - * Basic exception for ir_eval interactions. - */ - class ir_eval_exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; + using std::runtime_error::runtime_error; }; } } diff --git a/include/index/forward_index.h b/include/index/forward_index.h index 40fc2ef23..105623ed1 100644 --- a/include/index/forward_index.h +++ b/include/index/forward_index.h @@ -37,6 +37,14 @@ namespace meta { namespace index { +/** + * Basic exception for forward_index interactions. + */ +class forward_index_exception : public std::runtime_error +{ + public: + using std::runtime_error::runtime_error; +}; /** * The forward_index stores information on a corpus by doc_ids. Each doc_id key @@ -46,15 +54,6 @@ namespace index class forward_index : public disk_index { public: - /** - * Basic exception for forward_index interactions. - */ - class forward_index_exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; - /** * forward_index is a friend of the factory method used to create * it. diff --git a/include/index/inverted_index.h b/include/index/inverted_index.h index 738d827aa..3665d22e2 100644 --- a/include/index/inverted_index.h +++ b/include/index/inverted_index.h @@ -43,6 +43,14 @@ namespace meta { namespace index { +/** + * Basic exception for inverted_index interactions. + */ +class inverted_index_exception : public std::runtime_error +{ + public: + using std::runtime_error::runtime_error; +}; /** * The inverted_index class stores information on a corpus indexed by term_ids. @@ -56,15 +64,6 @@ namespace index class inverted_index : public disk_index { public: - /** - * Basic exception for inverted_index interactions. - */ - class inverted_index_exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; - using primary_key_type = term_id; using secondary_key_type = doc_id; using postings_data_type = postings_data; diff --git a/include/index/postings_inverter.h b/include/index/postings_inverter.h index 1855045ee..2fda4b2af 100644 --- a/include/index/postings_inverter.h +++ b/include/index/postings_inverter.h @@ -132,14 +132,6 @@ class postings_inverter */ uint64_t unique_primary_keys() const; - /** - * Simple exception class for postings_inverter interactions - */ - class postings_inverter_exception : public std::runtime_error - { - using std::runtime_error::runtime_error; - }; - private: /** * @param pdata The collection of postings_data objects to combine into a @@ -162,6 +154,14 @@ class postings_inverter /// Number of unique primary keys encountered while merging util::optional unique_primary_keys_; }; + +/** + * Simple exception class for postings_inverter interactions + */ +class postings_inverter_exception : public std::runtime_error +{ + using std::runtime_error::runtime_error; +}; } } diff --git a/include/index/vocabulary_map_writer.h b/include/index/vocabulary_map_writer.h index b9aa77098..39d0407fd 100644 --- a/include/index/vocabulary_map_writer.h +++ b/include/index/vocabulary_map_writer.h @@ -86,14 +86,6 @@ class vocabulary_map_writer */ void insert(const std::string& term); - /** - * An exception that can be thrown during the building of the tree. - */ - class vocabulary_map_writer_exception : public std::runtime_error - { - using std::runtime_error::runtime_error; - }; - private: /** * Writes null bytes to fill up the current block. 
@@ -133,6 +125,14 @@ class vocabulary_map_writer /// Number of written nodes to be "merged" when writing the next level uint64_t written_nodes_; }; + +/** + * An exception that can be thrown during the building of the tree. + */ +class vocabulary_map_writer_exception : public std::runtime_error +{ + using std::runtime_error::runtime_error; +}; } } #endif diff --git a/include/io/compressed_file_reader.h b/include/io/compressed_file_reader.h index b2215021b..13649bd08 100644 --- a/include/io/compressed_file_reader.h +++ b/include/io/compressed_file_reader.h @@ -155,16 +155,15 @@ class compressed_file_reader /// hold the (actual -> compressed id) mapping std::function mapping_; +}; +/** + * Basic exception for compressed_file_reader interactions. + */ +class compressed_file_reader_exception : public std::runtime_error +{ public: - /** - * Basic exception for compressed_file_reader interactions. - */ - class compressed_file_reader_exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; + using std::runtime_error::runtime_error; }; /** diff --git a/include/io/compressed_file_writer.h b/include/io/compressed_file_writer.h index 500356a72..abb998546 100644 --- a/include/io/compressed_file_writer.h +++ b/include/io/compressed_file_writer.h @@ -106,16 +106,15 @@ class compressed_file_writer /// Ensures the file isn't closed more than once bool closed_; +}; +/** + * Basic exception for compressed_file_writer interactions. + */ +class compressed_file_writer_exception : public std::runtime_error +{ public: - /** - * Basic exception for compressed_file_writer interactions. - */ - class compressed_file_writer_exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; + using std::runtime_error::runtime_error; }; /** diff --git a/include/io/mmap_file.h b/include/io/mmap_file.h index 5bff69be2..42a461588 100644 --- a/include/io/mmap_file.h +++ b/include/io/mmap_file.h @@ -87,16 +87,15 @@ class mmap_file /// no copying */ const mmap_file& operator=(const mmap_file& other) = delete; +}; +/** + * Basic exception for mmap_file interactions. + */ +class mmap_file_exception : public std::runtime_error +{ public: - /** - * Basic exception for mmap_file interactions. - */ - class mmap_file_exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; + using std::runtime_error::runtime_error; }; } } diff --git a/include/lm/diff.h b/include/lm/diff.h index 4534ab2c0..d351ee6ac 100644 --- a/include/lm/diff.h +++ b/include/lm/diff.h @@ -199,6 +199,9 @@ class diff bool lm_generate_; }; +/** + * Exception class for diff operations. + */ class diff_exception : public std::runtime_error { using std::runtime_error::runtime_error; diff --git a/include/lm/sentence.h b/include/lm/sentence.h index 4fb231014..1738e77a0 100644 --- a/include/lm/sentence.h +++ b/include/lm/sentence.h @@ -181,6 +181,9 @@ class sentence std::vector weights_; }; +/** + * Exception for sentence operations. 
+ */ class sentence_exception : public std::runtime_error { using std::runtime_error::runtime_error; diff --git a/include/parser/sr_parser.h b/include/parser/sr_parser.h index 3fafa1854..c6b412fe4 100644 --- a/include/parser/sr_parser.h +++ b/include/parser/sr_parser.h @@ -135,15 +135,6 @@ class sr_parser */ void save(const std::string& prefix) const; - /** - * Exception thrown during parser actions - */ - class exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; - /** * Sparse vector representation of a state's features. */ @@ -289,6 +280,15 @@ class sr_parser */ uint64_t beam_size_ = 1; }; + +/** + * Exception thrown during parser actions + */ +class sr_parser_exception : public std::runtime_error +{ + public: + using std::runtime_error::runtime_error; +}; } } #endif diff --git a/include/parser/transition_map.h b/include/parser/transition_map.h index 94603f6f4..3479a3957 100644 --- a/include/parser/transition_map.h +++ b/include/parser/transition_map.h @@ -65,17 +65,7 @@ class transition_map */ uint64_t size() const; - /** - * Exception thrown from interactions with the transition_map. - */ - class exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; - private: - /** * Loads the transitions from the given file. * @param store The transitions model input stream @@ -92,6 +82,15 @@ class transition_map */ std::vector transitions_; }; + +/** + * Exception thrown from interactions with the transition_map. + */ +class transition_map_exception : public std::runtime_error +{ + public: + using std::runtime_error::runtime_error; +}; } } #endif diff --git a/include/parser/trees/visitors/binarizer.h b/include/parser/trees/visitors/binarizer.h index 4ca33626c..05be0ee23 100644 --- a/include/parser/trees/visitors/binarizer.h +++ b/include/parser/trees/visitors/binarizer.h @@ -27,12 +27,15 @@ class binarizer : public tree_transformer public: std::unique_ptr operator()(const leaf_node&) override; std::unique_ptr operator()(const internal_node&) override; +}; - class exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; +/** + * Simple exception class for tree binarizer operations. + */ +class tree_binarizer_exception : public std::runtime_error +{ + public: + using std::runtime_error::runtime_error; }; } } diff --git a/include/parser/trees/visitors/transition_finder.h b/include/parser/trees/visitors/transition_finder.h index d47a0ebea..9ba635753 100644 --- a/include/parser/trees/visitors/transition_finder.h +++ b/include/parser/trees/visitors/transition_finder.h @@ -41,18 +41,21 @@ class transition_finder : public const_visitor */ std::vector transitions(); - class exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; - private: /** * Storage for the transitions. */ std::vector transitions_; }; + +/** + * Basic exception for transition finder operations. + */ +class transition_finder_exception : public std::runtime_error +{ + public: + using std::runtime_error::runtime_error; +}; } } diff --git a/include/sequence/observation.h b/include/sequence/observation.h index 18fdecddc..04595d3c6 100644 --- a/include/sequence/observation.h +++ b/include/sequence/observation.h @@ -98,15 +98,6 @@ class observation */ void features(feature_vector feats); - /** - * Basic exception class for observation interactions. 
- */ - class exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; - private: /// The symbol for this observation symbol_t symbol_; @@ -117,6 +108,15 @@ class observation /// The features for this observation feature_vector features_; }; + +/** + * Basic exception class for observation interactions. + */ +class observation_exception : public std::runtime_error +{ + public: + using std::runtime_error::runtime_error; +}; } } #endif diff --git a/include/util/disk_vector.h b/include/util/disk_vector.h index 890173f5e..90c9d02d2 100644 --- a/include/util/disk_vector.h +++ b/include/util/disk_vector.h @@ -35,7 +35,7 @@ template class disk_vector { static_assert(std::is_integral::value || std::is_floating_point::value - || std::is_base_of::value, + || std::is_base_of::value, "disk_vector templated types must be integral types"); public: @@ -250,16 +250,15 @@ class disk_vector /// the file descriptor used to open and close the mmap file int file_desc_; +}; +/** + * Basic exception for disk_vector. + */ +class disk_vector_exception : public std::runtime_error +{ public: - /** - * Basic exception for disk_vector. - */ - class disk_vector_exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; + using std::runtime_error::runtime_error; }; } } diff --git a/include/util/factory.h b/include/util/factory.h index b6d66853e..43e26d61f 100644 --- a/include/util/factory.h +++ b/include/util/factory.h @@ -21,6 +21,13 @@ namespace meta namespace util { +/** Simple exception for factories */ +class factory_exception : public std::runtime_error +{ + public: + using std::runtime_error::runtime_error; +}; + /** * Generic factory that can be subclassed to create factories for specific * types. @@ -35,13 +42,7 @@ class factory using pointer = std::unique_ptr; /// Convenience typedef for the factory methods used to create objects using factory_method = std::function; - - /** Simple exception for factories */ - class exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; + using exception = factory_exception; /** * Obtains the singleton. diff --git a/include/util/invertible_map.h b/include/util/invertible_map.h index 01e74db08..ed70f202f 100644 --- a/include/util/invertible_map.h +++ b/include/util/invertible_map.h @@ -91,16 +91,16 @@ class invertible_map /** * The "inner" iterator representation of the invertible_map. */ - typedef typename std::unordered_map - ::const_iterator InnerIterator; + typedef + typename std::unordered_map::const_iterator InnerIterator; /** * The invertible_map iterator is really just a wrapper for the forward * (key -> value) unordered_map iterator. Use this iterator class the * same way you'd use it on an unordered_map. */ - class Iterator : public std::iterator - + class Iterator + : public std::iterator { private: /// The iterator of the underlying unordered_map @@ -207,16 +207,15 @@ class invertible_map /// The internal map representing Value -> Key pairs std::unordered_map backward_; +}; +/** + * Basic exception for invertible_map interactions. + */ +class invertible_map_exception : public std::runtime_error +{ public: - /** - * Basic exception for invertible_map interactions. 
- */ - class invertible_map_exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; + using std::runtime_error::runtime_error; }; } } diff --git a/include/util/persistent_stack.h b/include/util/persistent_stack.h index 88732ed14..afc4821b4 100644 --- a/include/util/persistent_stack.h +++ b/include/util/persistent_stack.h @@ -31,12 +31,6 @@ class persistent_stack uint64_t size() const; - class exception : public std::runtime_error - { - public: - using std::runtime_error::runtime_error; - }; - private: struct node { @@ -51,6 +45,12 @@ class persistent_stack std::shared_ptr head_; uint64_t size_; }; + +class persistent_stack_exception : public std::runtime_error +{ + public: + using std::runtime_error::runtime_error; +}; } } diff --git a/include/util/persistent_stack.tcc b/include/util/persistent_stack.tcc index d838b9e95..4114762ec 100644 --- a/include/util/persistent_stack.tcc +++ b/include/util/persistent_stack.tcc @@ -35,7 +35,7 @@ template persistent_stack persistent_stack::pop() const { if (size_ == 0) - throw exception{"pop() called on empty stack"}; + throw persistent_stack_exception{"pop() called on empty stack"}; return {head_->prev, size_ - 1}; } @@ -44,7 +44,7 @@ template const T& persistent_stack::peek() const { if (!head_) - throw exception{"peek() called on empty stack"}; + throw persistent_stack_exception{"peek() called on empty stack"}; return head_->data; } diff --git a/src/analyzers/filters/filter_factory.cpp b/src/analyzers/filters/filter_factory.cpp index fbb7db5ec..3d5389b25 100644 --- a/src/analyzers/filters/filter_factory.cpp +++ b/src/analyzers/filters/filter_factory.cpp @@ -31,11 +31,11 @@ void filter_factory::register_tokenizer() add(Tokenizer::id, [](std::unique_ptr source, const cpptoml::table& config) { - if (source) - throw typename Tokenizer::token_stream_exception{ - "tokenizers must be the first filter"}; - return tokenizers::make_tokenizer(config); - }); + if (source) + throw token_stream_exception{ + "tokenizers must be the first filter"}; + return tokenizers::make_tokenizer(config); + }); } template diff --git a/src/analyzers/filters/icu_filter.cpp b/src/analyzers/filters/icu_filter.cpp index b76beb335..c34fe7987 100644 --- a/src/analyzers/filters/icu_filter.cpp +++ b/src/analyzers/filters/icu_filter.cpp @@ -75,7 +75,7 @@ std::unique_ptr { if (auto id = config.get_as("id")) return make_unique(std::move(src), *id); - throw token_stream::token_stream_exception{ + throw token_stream_exception{ "icu_filter requires id to be specified in config"}; } } diff --git a/src/analyzers/filters/length_filter.cpp b/src/analyzers/filters/length_filter.cpp index 640a696a1..9e8630e40 100644 --- a/src/analyzers/filters/length_filter.cpp +++ b/src/analyzers/filters/length_filter.cpp @@ -20,9 +20,9 @@ length_filter::length_filter(std::unique_ptr source, uint64_t min, uint64_t max) : source_{std::move(source)}, min_length_{min}, max_length_{max} { - using exception = token_stream::token_stream_exception; if (min_length_ > max_length_) - throw exception{"min filter length is greater than max filter length"}; + throw token_stream_exception{ + "min filter length is greater than max filter length"}; next_token(); } @@ -86,13 +86,12 @@ std::unique_ptr make_filter(std::unique_ptr src, const cpptoml::table& config) { - using exception = token_stream::token_stream_exception; auto min = config.get_as("min"); if (!min) - throw exception{"min required for length filter config"}; + throw token_stream_exception{"min required for length filter 
config"}; auto max = config.get_as("max"); if (!max) - throw exception{"max required for length filter config"}; + throw token_stream_exception{"max required for length filter config"}; return make_unique(std::move(src), static_cast(*min), static_cast(*max)); diff --git a/src/analyzers/filters/list_filter.cpp b/src/analyzers/filters/list_filter.cpp index 5abea85ed..ce9a003e4 100644 --- a/src/analyzers/filters/list_filter.cpp +++ b/src/analyzers/filters/list_filter.cpp @@ -99,11 +99,10 @@ std::unique_ptr make_filter(std::unique_ptr src, const cpptoml::table& config) { - using exception = token_stream::token_stream_exception; auto method = config.get_as("method"); auto file = config.get_as("file"); if (!file) - throw exception{"file required for list_filter config"}; + throw token_stream_exception{"file required for list_filter config"}; list_filter::type type = list_filter::type::REJECT; if (method) @@ -111,7 +110,7 @@ std::unique_ptr if (*method == "accept") type = list_filter::type::ACCEPT; else if (*method != "reject") - throw exception{"invalid method for list_filter"}; + throw token_stream_exception{"invalid method for list_filter"}; } return make_unique(std::move(src), *file, type); diff --git a/src/analyzers/tokenizers/icu_tokenizer.cpp b/src/analyzers/tokenizers/icu_tokenizer.cpp index bad28048e..da06b14ab 100644 --- a/src/analyzers/tokenizers/icu_tokenizer.cpp +++ b/src/analyzers/tokenizers/icu_tokenizer.cpp @@ -154,10 +154,9 @@ std::unique_ptr auto country = config.get_as("country"); bool suppress_tags = config.get_as("suppress-tags").value_or(false); - using exception = token_stream::token_stream_exception; - if (country && !language) - throw exception{"icu_tokenizer cannot be created with just a country"}; + throw token_stream_exception{ + "icu_tokenizer cannot be created with just a country"}; if (language) { diff --git a/src/classify/classifier/naive_bayes.cpp b/src/classify/classifier/naive_bayes.cpp index d7dd032a7..6e0188c7a 100644 --- a/src/classify/classifier/naive_bayes.cpp +++ b/src/classify/classifier/naive_bayes.cpp @@ -121,15 +121,17 @@ void naive_bayes::load(const std::string& prefix) #endif if (!tp_in) - throw exception{"term probability file not found at prefix " + prefix}; + throw naive_bayes_exception{"term probability file not found at prefix " + + prefix}; if (!cp_in) - throw exception{"class probability file not found at prefix " + prefix}; + throw naive_bayes_exception{ + "class probability file not found at prefix " + prefix}; uint64_t size; auto bytes = io::packed::read(tp_in, size); if (bytes == 0) - throw exception{ + throw naive_bayes_exception{ "failed reading term probability file (no size written)"}; term_probs_.clear(); diff --git a/src/corpus/document.cpp b/src/corpus/document.cpp index e982b3d0b..a66200c42 100644 --- a/src/corpus/document.cpp +++ b/src/corpus/document.cpp @@ -39,8 +39,7 @@ const std::string& document::content() const { if (content_) return *content_; - throw corpus::corpus_exception{ - "there is no content for the requested document"}; + throw corpus_exception{"there is no content for the requested document"}; } const std::string& document::encoding() const diff --git a/src/corpus/gz_corpus.cpp b/src/corpus/gz_corpus.cpp index 49f4f5651..24312df02 100644 --- a/src/corpus/gz_corpus.cpp +++ b/src/corpus/gz_corpus.cpp @@ -18,7 +18,7 @@ gz_corpus::gz_corpus(const std::string& file, std::string encoding) class_stream_{file + ".labels.gz"} { if (!filesystem::file_exists(file + ".numdocs")) - throw corpus::corpus_exception{ + throw 
corpus_exception{ file + ".numdocs file does not exist (required for gz_corpus)"}; try @@ -27,8 +27,8 @@ gz_corpus::gz_corpus(const std::string& file, std::string encoding) } catch (const std::exception& ex) { - throw corpus::corpus_exception{"Malformed numdocs file " + file - + ".numdocs: " + ex.what()}; + throw corpus_exception{"Malformed numdocs file " + file + ".numdocs: " + + ex.what()}; } } diff --git a/src/corpus/metadata.cpp b/src/corpus/metadata.cpp index 86557385c..f0b23c0ca 100644 --- a/src/corpus/metadata.cpp +++ b/src/corpus/metadata.cpp @@ -23,10 +23,10 @@ metadata::schema metadata_schema(const cpptoml::table& config) auto type = table->get_as("type"); if (!name) - throw metadata::exception{"name needed for metadata field"}; + throw metadata_exception{"name needed for metadata field"}; if (!type) - throw metadata::exception{"type needed for metadata field"}; + throw metadata_exception{"type needed for metadata field"}; metadata::field_type ftype; if (*type == "int") @@ -47,8 +47,8 @@ metadata::schema metadata_schema(const cpptoml::table& config) } else { - throw metadata::exception{"invalid metadata type: \"" + *type - + "\""}; + throw metadata_exception{"invalid metadata type: \"" + *type + + "\""}; } schema.emplace_back(*name, ftype); } diff --git a/src/corpus/metadata_parser.cpp b/src/corpus/metadata_parser.cpp index f06907de7..f90c67226 100644 --- a/src/corpus/metadata_parser.cpp +++ b/src/corpus/metadata_parser.cpp @@ -28,7 +28,7 @@ std::vector metadata_parser::next() for (const auto& finfo : schema_) { if (!parser_->has_next()) - throw metadata::exception{ + throw metadata_exception{ "metadata input file ended prematurely"}; auto str = parser_->next(); diff --git a/src/features/selector_factory.cpp b/src/features/selector_factory.cpp index 6edf78841..b68effd0d 100644 --- a/src/features/selector_factory.cpp +++ b/src/features/selector_factory.cpp @@ -33,16 +33,16 @@ std::unique_ptr { auto table = config.get_table("features"); if (!table) - throw selector_factory::exception{ + throw selector_factory_exception{ "[features] table missing from config file"}; auto prefix = table->get_as("prefix"); if (!prefix) - throw selector_factory::exception{"no prefix in [features] table"}; + throw selector_factory_exception{"no prefix in [features] table"}; auto method = table->get_as("method"); if (!method) - throw selector_factory::exception{ + throw selector_factory_exception{ "feature selection method required in [features] table"}; uint64_t features_per_class = 20; diff --git a/src/index/metadata_file.cpp b/src/index/metadata_file.cpp index c80abd4e2..acafc2687 100644 --- a/src/index/metadata_file.cpp +++ b/src/index/metadata_file.cpp @@ -24,7 +24,7 @@ struct char_input_stream char get() { if (input_ == end_) - throw corpus::metadata::exception{ + throw corpus::metadata_exception{ "seeking past end of metadata file"}; return *input_++; @@ -59,7 +59,7 @@ metadata_file::metadata_file(const std::string& prefix) corpus::metadata metadata_file::get(doc_id d_id) const { if (d_id >= index_.size()) - throw corpus::metadata::exception{ + throw corpus::metadata_exception{ "invalid doc id in metadata retrieval"}; uint64_t seek_pos = index_[d_id]; diff --git a/src/index/metadata_writer.cpp b/src/index/metadata_writer.cpp index 4be4dd1f7..dd53893af 100644 --- a/src/index/metadata_writer.cpp +++ b/src/index/metadata_writer.cpp @@ -49,7 +49,7 @@ void metadata_writer::write(doc_id d_id, uint64_t length, uint64_t num_unique, // write optional metadata if (mdata.size() != schema_.size()) - throw 
corpus::metadata::exception{ + throw corpus::metadata_exception{ "schema mismatch when writing metadata"}; for (const auto& fld : mdata) diff --git a/src/index/tools/query-runner.cpp b/src/index/tools/query-runner.cpp index da23ccb47..7b5feb427 100644 --- a/src/index/tools/query-runner.cpp +++ b/src/index/tools/query-runner.cpp @@ -59,7 +59,7 @@ int main(int argc, char* argv[]) { eval = make_unique(*config); } - catch (index::ir_eval::ir_eval_exception& ex) + catch (index::ir_eval_exception& ex) { LOG(info) << "Could not find relevance judgements; skipping eval" << ENDLG; diff --git a/src/parser/sr_parser.cpp b/src/parser/sr_parser.cpp index 7dd2378d1..b8865daa6 100644 --- a/src/parser/sr_parser.cpp +++ b/src/parser/sr_parser.cpp @@ -126,7 +126,7 @@ parse_tree sr_parser::parse(const sequence::sequence& sentence) const } if (new_agenda.size() == 0) - throw exception{"unparsable"}; + throw sr_parser_exception{"unparsable"}; agenda = std::move(new_agenda); } @@ -257,7 +257,7 @@ std::pair sr_parser::train_instance( return train_beam_search(tree, transitions, options, update); default: - throw exception{"Not yet implemented"}; + throw sr_parser_exception{"Not yet implemented"}; } } @@ -466,7 +466,7 @@ void sr_parser::load(const std::string& prefix) void sr_parser::load(std::istream& model) { if (!model) - throw exception{"model file not found"}; + throw sr_parser_exception{"model file not found"}; io::read_binary(model, beam_size_); model_.load(model); diff --git a/src/parser/state.cpp b/src/parser/state.cpp index 96d5c0d22..5fd42df1d 100644 --- a/src/parser/state.cpp +++ b/src/parser/state.cpp @@ -33,7 +33,7 @@ state::state(const sequence::sequence& sentence) for (const auto& obs : sentence) { if (!obs.tagged()) - throw sr_parser::exception{"sentence must be POS tagged"}; + throw sr_parser_exception{"sentence must be POS tagged"}; std::string word = obs.symbol(); class_label tag{obs.tag()}; @@ -113,7 +113,7 @@ state state::advance(const transition& trans) const default: { - throw sr_parser::exception{"Unreachable"}; + throw sr_parser_exception{"Unreachable"}; } } } @@ -305,7 +305,7 @@ bool state::legal(const transition& trans) const return idle_legal(*this); default: - throw sr_parser::exception{"Unreachable"}; + throw sr_parser_exception{"Unreachable"}; } } @@ -344,7 +344,7 @@ transition state::emergency_transition() const } } - throw sr_parser::exception{"emergency transition impossible"}; + throw sr_parser_exception{"emergency transition impossible"}; } size_t state::stack_size() const diff --git a/src/parser/transition_map.cpp b/src/parser/transition_map.cpp index ad406d38c..8f70129e4 100644 --- a/src/parser/transition_map.cpp +++ b/src/parser/transition_map.cpp @@ -36,20 +36,21 @@ transition_map::transition_map(const std::string& prefix) void transition_map::load(std::istream& store) { if (!store) - throw exception{"missing transitions model file"}; + throw transition_map_exception{"missing transitions model file"}; uint64_t num_trans; io::read_binary(store, num_trans); if (!store) - throw exception{"malformed transitions model file"}; + throw transition_map_exception{"malformed transitions model file"}; transitions_.reserve(num_trans); for (uint64_t i = 0; i < num_trans; ++i) { if (!store) - throw exception{"malformed transition model file (too few " - "transitions written)"}; + throw transition_map_exception{ + "malformed transition model file (too few " + "transitions written)"}; transition::type_t trans_type; io::read_binary(store, trans_type); diff --git 
a/src/parser/trees/visitors/binarizer.cpp b/src/parser/trees/visitors/binarizer.cpp index 1c2865234..cdea0953d 100644 --- a/src/parser/trees/visitors/binarizer.cpp +++ b/src/parser/trees/visitors/binarizer.cpp @@ -71,7 +71,7 @@ std::unique_ptr binarizer::operator()(const internal_node& in) // locate head node auto head = in.head_constituent(); if (!head) - throw exception{"Head constituent not labeled"}; + throw tree_binarizer_exception{"Head constituent not labeled"}; uint64_t head_idx = 0; for (uint64_t idx = 0; idx < in.num_children(); ++idx) diff --git a/src/parser/trees/visitors/transition_finder.cpp b/src/parser/trees/visitors/transition_finder.cpp index ca744e7e3..73f8e2987 100644 --- a/src/parser/trees/visitors/transition_finder.cpp +++ b/src/parser/trees/visitors/transition_finder.cpp @@ -20,7 +20,7 @@ void transition_finder::operator()(const leaf_node&) void transition_finder::operator()(const internal_node& in) { if (in.num_children() > 2) - throw exception{ + throw transition_finder_exception{ "Trees must be binarized before transitions are generated"}; in.each_child([&](const node* n) @@ -42,8 +42,9 @@ void transition_finder::operator()(const internal_node& in) } else { - throw exception{"Incorrect head annotations (head was neither left nor " - "right child)"}; + throw transition_finder_exception{ + "Incorrect head annotations (head was neither left nor " + "right child)"}; } } diff --git a/src/sequence/observation.cpp b/src/sequence/observation.cpp index 0418510a9..01bc70b68 100644 --- a/src/sequence/observation.cpp +++ b/src/sequence/observation.cpp @@ -31,14 +31,14 @@ const tag_t& observation::tag() const { if (tag_) return *tag_; - throw exception{"no tag for this observation"}; + throw observation_exception{"no tag for this observation"}; } const label_id& observation::label() const { if (label_) return *label_; - throw exception{"no label for this observation"}; + throw observation_exception{"no label for this observation"}; } void observation::symbol(symbol_t sym) @@ -61,7 +61,7 @@ bool observation::tagged() const return static_cast(tag_); } -auto observation::features() const -> const feature_vector& +auto observation::features() const -> const feature_vector & { return features_; } From bf978079d861d875ad2a1bf81f01e9654a316554 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 12 Sep 2015 10:44:19 -0500 Subject: [PATCH 271/481] explicitly check for blank lines in file_corpus to improve error message see https://forum.meta-toolkit.org/t/problem-indexing-a-new-file-corpus-based-on-wt2g --- src/corpus/file_corpus.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/corpus/file_corpus.cpp b/src/corpus/file_corpus.cpp index 9c34d58c1..569b56a75 100644 --- a/src/corpus/file_corpus.cpp +++ b/src/corpus/file_corpus.cpp @@ -22,6 +22,9 @@ file_corpus::file_corpus(const std::string& prefix, const std::string& doc_list, while (psr.has_next()) { std::string line = psr.next(); + if (line.empty()) + throw corpus_exception{"empty line in corpus list: line #" + + std::to_string(idx + 1)}; size_t space = line.find_first_of(" "); if (space != std::string::npos) { @@ -34,6 +37,7 @@ file_corpus::file_corpus(const std::string& prefix, const std::string& doc_list, throw corpus_exception{"document list needs class label prefix " "(add [none] if there are no labels)"}; } + ++idx; } } From 29aeca8d34236ececdcccb66cd10c2cb835abbd5 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 12 Sep 2015 10:56:07 -0500 Subject: [PATCH 272/481] let line_corpus use the .numdocs file if it 
exists --- src/corpus/line_corpus.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/corpus/line_corpus.cpp b/src/corpus/line_corpus.cpp index 1fccffc65..790ee9923 100644 --- a/src/corpus/line_corpus.cpp +++ b/src/corpus/line_corpus.cpp @@ -30,8 +30,21 @@ line_corpus::line_corpus(const std::string& file, std::string encoding, num_lines_ = filesystem::num_lines(file + ".labels"); } + if (num_lines_ == 0 && filesystem::file_exists(file + ".numdocs")) + { + try + { + num_lines_ = std::stoul(filesystem::file_text(file + ".numdocs")); + } + catch (const std::exception& ex) + { + throw corpus_exception{"Malformed numdocs file " + file + + ".numdocs: " + ex.what()}; + } + } + // if we couldn't determine the number of lines in the constructor and the - // two optional files don't exist, we have to count newlines here + // optional files don't exist, we have to count newlines here if (num_lines_ == 0) num_lines_ = filesystem::num_lines(file); } From d2c5ed207ad5d1c17329e4a80ab5c213f991584a Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 12 Sep 2015 11:05:48 -0500 Subject: [PATCH 273/481] use cpptoml's value_or in diff and selector_factory --- src/features/selector_factory.cpp | 6 ++---- src/lm/diff.cpp | 24 +++++++----------------- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/src/features/selector_factory.cpp b/src/features/selector_factory.cpp index b68effd0d..2532aa94d 100644 --- a/src/features/selector_factory.cpp +++ b/src/features/selector_factory.cpp @@ -45,10 +45,8 @@ std::unique_ptr throw selector_factory_exception{ "feature selection method required in [features] table"}; - uint64_t features_per_class = 20; - auto num_features = table->get_as("features-per-class"); - if (num_features) - features_per_class = *num_features; + uint64_t features_per_class + = table->get_as("features-per-class").value_or(20); auto selector = selector_factory::get().create(*method, *table, std::move(idx)); diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index cd6155362..3eb847692 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -30,28 +30,18 @@ diff::diff(const cpptoml::table& config) : lm_{config} throw diff_exception{"max-edits not specified in config"}; max_edits_ = *edits; - auto b_pen = table->get_as("base-penalty"); - base_penalty_ = b_pen ? *b_pen : 0.0; - - auto i_pen = table->get_as("insert-penalty"); - insert_penalty_ = i_pen ? *i_pen : 0.0; - - auto s_pen = table->get_as("substitute-penalty"); - substitute_penalty_ = s_pen ? *s_pen : 0.0; - - auto r_pen = table->get_as("remove-penalty"); - remove_penalty_ = r_pen ? *r_pen : 0.0; - - auto max_cand = table->get_as("max-candidates"); - max_cand_size_ = max_cand ? *max_cand : 20; - auto lambda = table->get_as("lambda"); lambda_ = lambda ? *lambda : 0.5; if (lambda_ < 0.0 || lambda_ > 1.0) throw diff_exception{"lambda value has to be on [0,1]"}; - auto lm_gen = table->get_as("lm-generate"); - lm_generate_ = lm_gen ? 
*lm_gen : false; + base_penalty_ = table->get_as("base-penalty").value_or(0.0); + insert_penalty_ = table->get_as("insert-penalty").value_or(0.0); + substitute_penalty_ + = table->get_as("substitute-penalty").value_or(0.0); + remove_penalty_ = table->get_as("remove-penalty").value_or(0.0); + max_cand_size_ = table->get_as("max-candidates").value_or(20); + lm_generate_ = table->get_as("lm-generate").value_or(false); set_stems(*table); set_function_words(*table); From 9eb2aee8ffd8d20fbe943e471231b0fb75a507dd Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 12 Sep 2015 11:46:37 -0500 Subject: [PATCH 274/481] document classifiers re: #91 --- include/classify/classifier/dual_perceptron.h | 26 +++++++++++++++++++ include/classify/classifier/knn.h | 16 ++++++++++++ .../classify/classifier/logistic_regression.h | 17 ++++++++++++ include/classify/classifier/naive_bayes.h | 9 +++++++ .../classify/classifier/nearest_centroid.h | 3 +++ include/classify/classifier/one_vs_all.h | 10 +++++++ include/classify/classifier/one_vs_one.h | 10 +++++++ include/classify/classifier/sgd.h | 18 +++++++++++++ include/classify/classifier/svm_wrapper.h | 13 ++++++++++ include/classify/classifier/winnow.h | 11 ++++++++ 10 files changed, 133 insertions(+) diff --git a/include/classify/classifier/dual_perceptron.h b/include/classify/classifier/dual_perceptron.h index 3c8c0d8ac..3dae0c830 100644 --- a/include/classify/classifier/dual_perceptron.h +++ b/include/classify/classifier/dual_perceptron.h @@ -24,6 +24,32 @@ namespace classify * Implements a perceptron classifier, but using the dual formulation of * the problem. This allows the perceptron to be used for data that is not * necessarily linearly separable via the use of a kernel function. + * + * Required config parameters: + * ~~~toml + * [classifier] + * method = "dual-perceptron" + * ~~~ + * + * Optional config parameters: + * ~~~toml + * [classifier] + * alpha = 0.1 + * gamma = 0.05 + * bias = 0.0 + * + * # kernels (optional, but if used they have required params) + * kernel = "polynomial" + * + * # or + * kernel = "rbf" + * rbf-gamma = 0.1 # value required + * + * # or + * kernel = "sigmoid" + * sigmoid-alpha = 0.1 # value required + * sigmoid-c = 0.1 # value required + * ~~~ */ class dual_perceptron : public classifier { diff --git a/include/classify/classifier/knn.h b/include/classify/classifier/knn.h index 07c84eba6..74cc00802 100644 --- a/include/classify/classifier/knn.h +++ b/include/classify/classifier/knn.h @@ -23,6 +23,22 @@ namespace classify /** * Implements the k-Nearest Neighbor lazy learning classification algorithm. + * + * Required config parameters: + * ~~~toml + * [classifier] + * method = "knn" + * k = 10 + * + * [classifier.ranker] + * method = "dirichlet-prior" # any ranker id + * ~~~ + * + * Optional config parameters: + * ~~~toml + * [classifier] + * weighted = true # default is false + * ~~~ */ class knn : public classifier { diff --git a/include/classify/classifier/logistic_regression.h b/include/classify/classifier/logistic_regression.h index 959db3739..e9ab9f171 100644 --- a/include/classify/classifier/logistic_regression.h +++ b/include/classify/classifier/logistic_regression.h @@ -48,6 +48,23 @@ namespace classify * The individual class probabilities may be recovered by using the * `predict` function: this returns an `unordered_map` of `class_label` to * probability. 
+ * + * Required config parameters: + * ~~~toml + * [classifier] + * method = "logistic-regression" + * prefix = "path-to-model" + * ~~~ + * + * Optional config parameters: + * ~~~toml + * [classifier] + * alpha = 0.001 + * gamma = 1e-6 + * bias = 1.0 + * lambda = 0.0001 + * max-iter = 50 + * ~~~ */ class logistic_regression : public classifier { diff --git a/include/classify/classifier/naive_bayes.h b/include/classify/classifier/naive_bayes.h index acd35d8ff..df010e8bd 100644 --- a/include/classify/classifier/naive_bayes.h +++ b/include/classify/classifier/naive_bayes.h @@ -25,6 +25,15 @@ namespace classify /** * Implements the Naive Bayes classifier, a simplistic probabilistic classifier * that uses Bayes' theorem with strong feature independence assumptions. + * + * Required config parameters: none. + * Optional config parameters: + * ~~~toml + * [classifier] + * method = "naive-bayes" + * alpha = 0.1 + * beta = 0.1 + * ~~~ */ class naive_bayes : public classifier { diff --git a/include/classify/classifier/nearest_centroid.h b/include/classify/classifier/nearest_centroid.h index 13fdd269e..a98d8d6cf 100644 --- a/include/classify/classifier/nearest_centroid.h +++ b/include/classify/classifier/nearest_centroid.h @@ -27,6 +27,9 @@ namespace classify * centroid they query is closest to is returned. * @see Centroid-Based Document Classification: Analysis and Experimental * Results, Eui-Hong Han and George Karypis, 2000 + * + * Required config parameters: none + * Optional config parameters: none */ class nearest_centroid : public classifier { diff --git a/include/classify/classifier/one_vs_all.h b/include/classify/classifier/one_vs_all.h index 8db8b7a90..e10a02bf2 100644 --- a/include/classify/classifier/one_vs_all.h +++ b/include/classify/classifier/one_vs_all.h @@ -21,6 +21,16 @@ namespace classify /** * Generalizes binary classifiers to operate over multiclass types using the * one vs all method. + * + * Required config parameters: + * ~~~toml + * [classifier] + * method = "one-vs-all" + * [classifier.base] + * method = "sgd" # for example + * loss = "hinge" # for example + * prefix = "sgd-model" # for example + * ~~~ */ class one_vs_all : public classifier { diff --git a/include/classify/classifier/one_vs_one.h b/include/classify/classifier/one_vs_one.h index 8ba783fd4..4136a744a 100644 --- a/include/classify/classifier/one_vs_one.h +++ b/include/classify/classifier/one_vs_one.h @@ -24,6 +24,16 @@ namespace classify * entails creating a classifier for each pair of classes, and assigning * the label which gets the most "votes" from each individual * binary_classifier as the label for a given document. + * + * Required config parameters: + * ~~~toml + * [classifier] + * method = "one-vs-one" + * [classifier.base] + * method = "sgd" # for example + * loss = "hinge" # for example + * prefix = "sgd-model" # for example + * ~~~ */ class one_vs_one : public classifier { diff --git a/include/classify/classifier/sgd.h b/include/classify/classifier/sgd.h index e406869c3..c0774a3c3 100644 --- a/include/classify/classifier/sgd.h +++ b/include/classify/classifier/sgd.h @@ -24,6 +24,24 @@ namespace classify * Implements stochastic gradient descent for learning binary linear * classifiers. These may be extended to multiclass classification using * the one_vs_all or all_vs_all adapters. 
+ * + * Required config parameters: + * ~~~toml + * [classifier] + * method = "sgd" + * prefix = "path-to-model" + * loss = "hinge" # or "huber", "least-squares", "logistic", etc + * ~~~ + * + * Optional config parameters: + * ~~~toml + * [classifier] + * alpha = 0.001 + * gamma = 1e-6 + * bias = 1.0 + * lambda = 0.0001 + * max-iter = 50 + * ~~~ */ class sgd : public binary_classifier { diff --git a/include/classify/classifier/svm_wrapper.h b/include/classify/classifier/svm_wrapper.h index 88244884b..0959e8105 100644 --- a/include/classify/classifier/svm_wrapper.h +++ b/include/classify/classifier/svm_wrapper.h @@ -29,6 +29,19 @@ namespace classify * submodule and have compiled both libsvm and liblinear. * * If no kernel is selected, liblinear is used. Otherwise, libsvm is used. + * + * Required config parameters: + * ~~~toml + * [classifier] + * method = "svm-wrapper" + * path = "path-to-libsvm-modules" + * ~~~ + * + * Optional config parameters: + * ~~~toml + * [classifier] + * kernel = "quadratic" # or "none", "cubic", "quartic", "rbf", or "sigmoid" + * ~~~ */ class svm_wrapper : public classifier { diff --git a/include/classify/classifier/winnow.h b/include/classify/classifier/winnow.h index 32e1dc23f..d6571bdb2 100644 --- a/include/classify/classifier/winnow.h +++ b/include/classify/classifier/winnow.h @@ -26,6 +26,17 @@ namespace classify * Implements the Winnow classifier, a simplistic linear classifier for * linearly-separable data. As opposed to perceptron (which uses an additive * update rule), winnow uses a multiplicative update rule. + * + * Required config parameters: none + * + * Optional config parameters: + * ~~~toml + * [classifier] + * method = "winnow" + * m = 1.5 + * gamma = 0.05 + * max-iter = 100 + * ~~~ */ class winnow : public classifier { From 051db379122bf3fec23aeae07f913c0c0b40999c Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 12 Sep 2015 12:07:23 -0500 Subject: [PATCH 275/481] document parser and sequence re: #91 --- src/parser/tools/parser_test.cpp | 3 +++ src/parser/tools/parser_train.cpp | 22 ++++++++++++++++++++++ src/sequence/crf/tools/crf-test.cpp | 3 +++ src/sequence/crf/tools/crf-train.cpp | 17 +++++++++++++++++ src/sequence/tools/greedy_tagger_test.cpp | 3 +++ src/sequence/tools/greedy_tagger_train.cpp | 18 +++++++++++++++++- 6 files changed, 65 insertions(+), 1 deletion(-) diff --git a/src/parser/tools/parser_test.cpp b/src/parser/tools/parser_test.cpp index 97e3eb76f..6e5afbfaa 100644 --- a/src/parser/tools/parser_test.cpp +++ b/src/parser/tools/parser_test.cpp @@ -24,6 +24,9 @@ std::string two_digit(uint8_t num) return ss.str(); } +/** + * For config parameters, see parser_train.
+ */ int main(int argc, char** argv) { diff --git a/src/parser/tools/parser_train.cpp b/src/parser/tools/parser_train.cpp index abbf83e9e..b33929f03 100644 --- a/src/parser/tools/parser_train.cpp +++ b/src/parser/tools/parser_train.cpp @@ -21,6 +21,28 @@ std::string two_digit(uint8_t num) return ss.str(); } +/** + * Required config parameters: + * ~~~toml + * prefix = "global-data-prefix" + * + * [parser] + * prefix = "path-to-model" + * treebank = "penn-treebank" # relative to data prefix + * corpus = "wsj" + * section-size = 99 + * train-sections = [0, 18] + * dev-sections = [19, 21] + * test-sections = [22, 24] + * ~~~ + * + * Optional config parameters: + * ~~~toml + * [parser] + * train-threads = 8 + * train-algorithm = "early-termination" # or "beam-search" + * ~~~ + */ int main(int argc, char** argv) { diff --git a/src/sequence/crf/tools/crf-test.cpp b/src/sequence/crf/tools/crf-test.cpp index cfface956..3ab3e65c1 100644 --- a/src/sequence/crf/tools/crf-test.cpp +++ b/src/sequence/crf/tools/crf-test.cpp @@ -15,6 +15,9 @@ std::string two_digit(uint8_t num) return ss.str(); } +/** + * For config params, see crf_train. + */ int main(int argc, char** argv) { if (argc < 2) diff --git a/src/sequence/crf/tools/crf-train.cpp b/src/sequence/crf/tools/crf-train.cpp index 17a398603..3eb15dde4 100644 --- a/src/sequence/crf/tools/crf-train.cpp +++ b/src/sequence/crf/tools/crf-train.cpp @@ -18,6 +18,23 @@ std::string two_digit(uint8_t num) return ss.str(); } +/** + * Required config parameters: + * ~~~toml + * prefix = "global-data-prefix" + * + * [crf] + * prefix = "path-to-model" + * treebank = "penn-treebank" # relative to data prefix + * corpus = "wsj" + * section-size = 99 + * train-sections = [0, 18] + * dev-sections = [19, 21] + * test-sections = [22, 24] + * ~~~ + * + * Optional config parameters: none + */ int main(int argc, char** argv) { if (argc < 2) diff --git a/src/sequence/tools/greedy_tagger_test.cpp b/src/sequence/tools/greedy_tagger_test.cpp index 26ef45ed7..abad870cb 100644 --- a/src/sequence/tools/greedy_tagger_test.cpp +++ b/src/sequence/tools/greedy_tagger_test.cpp @@ -22,6 +22,9 @@ std::string two_digit(uint8_t num) return ss.str(); } +/** + * For config params, see greedy_tagger_train. 
+ */ int main(int argc, char** argv) { if (argc < 2) diff --git a/src/sequence/tools/greedy_tagger_train.cpp b/src/sequence/tools/greedy_tagger_train.cpp index 24d22bc94..e1e63a401 100644 --- a/src/sequence/tools/greedy_tagger_train.cpp +++ b/src/sequence/tools/greedy_tagger_train.cpp @@ -20,7 +20,23 @@ std::string two_digit(uint8_t num) ss << std::setw(2) << std::setfill('0') << static_cast(num); return ss.str(); } - +/** + * Required config parameters: + * ~~~toml + * prefix = "global-data-prefix" + * + * [sequence] + * prefix = "path-to-model" + * treebank = "penn-treebank" # relative to data prefix + * corpus = "wsj" + * section-size = 99 + * train-sections = [0, 18] + * dev-sections = [19, 21] + * test-sections = [22, 24] + * ~~~ + * + * Optional config parameters: none + */ int main(int argc, char** argv) { if (argc < 2) From 95e1d7a0f43970b8df6ff99478c106f0d817cf3b Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 12 Sep 2015 12:10:09 -0500 Subject: [PATCH 276/481] keep source and binary names consistent wrt dashes and underscores --- src/sequence/crf/tools/CMakeLists.txt | 4 ++-- src/sequence/crf/tools/{crf-test.cpp => crf_test.cpp} | 0 src/sequence/crf/tools/{crf-train.cpp => crf_train.cpp} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename src/sequence/crf/tools/{crf-test.cpp => crf_test.cpp} (100%) rename src/sequence/crf/tools/{crf-train.cpp => crf_train.cpp} (100%) diff --git a/src/sequence/crf/tools/CMakeLists.txt b/src/sequence/crf/tools/CMakeLists.txt index 0e57b1bd7..865b4d0d2 100644 --- a/src/sequence/crf/tools/CMakeLists.txt +++ b/src/sequence/crf/tools/CMakeLists.txt @@ -1,7 +1,7 @@ -add_executable(crf-train crf-train.cpp) +add_executable(crf-train crf_train.cpp) target_link_libraries(crf-train meta-crf) -add_executable(crf-test crf-test.cpp) +add_executable(crf-test crf_test.cpp) target_link_libraries(crf-test meta-crf meta-classify) add_executable(pos-tag pos_tag.cpp) diff --git a/src/sequence/crf/tools/crf-test.cpp b/src/sequence/crf/tools/crf_test.cpp similarity index 100% rename from src/sequence/crf/tools/crf-test.cpp rename to src/sequence/crf/tools/crf_test.cpp diff --git a/src/sequence/crf/tools/crf-train.cpp b/src/sequence/crf/tools/crf_train.cpp similarity index 100% rename from src/sequence/crf/tools/crf-train.cpp rename to src/sequence/crf/tools/crf_train.cpp From c6dc071bc9a59c16d04e1184a65c262300fc899a Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 12 Sep 2015 12:31:15 -0500 Subject: [PATCH 277/481] convert fixed_heap to use std::vector as the underlying container this allows it to be iterable (although not strictly sorted) for the parser and other applications --- include/util/fixed_heap.h | 36 +++++++++++++++++++++++--- include/util/fixed_heap.tcc | 51 ++++++++++++++++++++++++++++++------- 2 files changed, 74 insertions(+), 13 deletions(-) diff --git a/include/util/fixed_heap.h b/include/util/fixed_heap.h index fae715d3b..09eeb29d4 100644 --- a/include/util/fixed_heap.h +++ b/include/util/fixed_heap.h @@ -10,7 +10,7 @@ #ifndef META_FIXED_HEAP_H_ #define META_FIXED_HEAP_H_ -#include +#include namespace meta { @@ -24,6 +24,10 @@ template class fixed_heap { public: + using iterator = typename std::vector::iterator; + using const_iterator = typename std::vector::const_iterator; + using size_type = typename std::vector::size_type; + /** * @param max_elems * @param comp The priority comparison function for elements in this heap @@ -47,22 +51,46 @@ class fixed_heap * @return the current number of elements in this heap; will always be less 
* than or equal to max_elems() */ - uint64_t size() const; + size_type size() const; /** * @return the maximum number of elements this heap will store */ - uint64_t max_elems() const; + size_type max_elems() const; /** * @return a reverse-sorted list */ std::vector reverse_and_clear(); + /** + * @return an iterator to the beginning of the fixed_heap + * @note the heap is not fully sorted + */ + iterator begin(); + + /** + * @return an iterator to the end of the fixed_heap + * @note the heap is not fully sorted + */ + iterator end(); + + /** + * @return a const_iterator to the beginning of the fixed_heap + * @note the heap is not fully sorted + */ + const_iterator begin() const; + + /** + * @return a const_iterator to the end of the fixed_heap + * @note the heap is not fully sorted + */ + const_iterator end() const; + private: uint64_t max_elems_; Comp comp_; - std::priority_queue, decltype(comp_)> pq_; + std::vector pq_; }; } } diff --git a/include/util/fixed_heap.tcc b/include/util/fixed_heap.tcc index 2ed01a902..fdc923d1f 100644 --- a/include/util/fixed_heap.tcc +++ b/include/util/fixed_heap.tcc @@ -11,7 +11,7 @@ namespace util { template fixed_heap::fixed_heap(uint64_t max_elems, Comp comp) - : max_elems_{max_elems}, comp_(comp), pq_{comp} + : max_elems_{max_elems}, comp_(comp) { // nothing } @@ -20,27 +20,35 @@ template template void fixed_heap::emplace(Args&&... args) { - pq_.emplace(std::forward(args)...); + pq_.emplace_back(std::forward(args)...); + std::push_heap(pq_.begin(), pq_.end(), comp_); if (size() > max_elems()) - pq_.pop(); + { + std::pop_heap(pq_.begin(), pq_.end(), comp_); + pq_.pop_back(); + } } template void fixed_heap::push(const T& elem) { - pq_.push(elem); + pq_.push_back(elem); + std::push_heap(pq_.begin(), pq_.end(), comp_); if (size() > max_elems()) - pq_.pop(); + { + std::pop_heap(pq_.begin(), pq_.end(), comp_); + pq_.pop_back(); + } } template -uint64_t fixed_heap::size() const +auto fixed_heap::size() const -> size_type { return pq_.size(); } template -uint64_t fixed_heap::max_elems() const +auto fixed_heap::max_elems() const -> size_type { return max_elems_; } @@ -52,11 +60,36 @@ std::vector fixed_heap::reverse_and_clear() sorted.reserve(size()); while (!pq_.empty()) { - sorted.emplace_back(std::move(pq_.top())); - pq_.pop(); + sorted.emplace_back(std::move(pq_.front())); + std::pop_heap(pq_.begin(), pq_.end(), comp_); + pq_.pop_back(); } std::reverse(sorted.begin(), sorted.end()); return sorted; } + +template +typename fixed_heap::iterator fixed_heap::begin() +{ + return pq_.begin(); +} + +template +typename fixed_heap::iterator fixed_heap::end() +{ + return pq_.end(); +} + +template +typename fixed_heap::const_iterator fixed_heap::begin() const +{ + return pq_.cbegin(); +} + +template +typename fixed_heap::const_iterator fixed_heap::end() const +{ + return pq_.cend(); +} } } From e45f526b8c96a7839f8157f4873d387d0037ba07 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sat, 12 Sep 2015 14:39:13 -0500 Subject: [PATCH 278/481] fixed_heap documentation/API improvements. - Add more documentation about the expectation of the comparison function - Remove non-const iteration support (that can invalidate the heap) - Change reverse_and_clear() to extract_top() (it's not reversing things from the user's perspective; new name makes its purpose more clear). 
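A rough usage sketch against the new API; top_k, term_score, and counts are
invented for illustration here, only fixed_heap itself is from the library:

    #include <cstdint>
    #include <string>
    #include <utility>
    #include <vector>

    #include "util/fixed_heap.h"

    using term_score = std::pair<std::string, double>;

    // return the k highest-scoring pairs from `counts`; the comparator is
    // written exactly as it would be for std::sort to produce descending,
    // best-first output
    std::vector<term_score> top_k(const std::vector<term_score>& counts,
                                  uint64_t k)
    {
        auto comp = [](const term_score& a, const term_score& b)
        { return a.second > b.second; };
        meta::util::fixed_heap<term_score, decltype(comp)> heap{k, comp};
        for (const auto& term : counts)
            heap.emplace(term); // O(log k) per insertion; size never exceeds k
        return heap.extract_top(); // sorted best-first; the heap is emptied
    }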
--- include/util/fixed_heap.h | 25 +++++++++++-------------- include/util/fixed_heap.tcc | 29 +++++++---------------------- src/index/ranker/ranker.cpp | 2 +- src/lm/diff.cpp | 2 +- src/lm/language_model.cpp | 2 +- src/tools/top_k.cpp | 2 +- 6 files changed, 22 insertions(+), 40 deletions(-) diff --git a/include/util/fixed_heap.h b/include/util/fixed_heap.h index 09eeb29d4..a840f9415 100644 --- a/include/util/fixed_heap.h +++ b/include/util/fixed_heap.h @@ -19,6 +19,14 @@ namespace util /** * Keeps a constant number of high-priority elements. This is useful for finding * the "top-k" T elements using the comparison function Comp. + * + * Internally, this class maintains a min-heap of max size `max_elems`, + * meaning that a push/emplace takes \f$O(\log k)\f$ where \f$k\f$ is + * `max_elems` passed on construction. + * + * Your comparison function should be the same as you would pass to e.g. + * std::sort such that the result would be the elements in **descending** + * order. */ template class fixed_heap @@ -59,21 +67,10 @@ class fixed_heap size_type max_elems() const; /** - * @return a reverse-sorted list - */ - std::vector reverse_and_clear(); - - /** - * @return an iterator to the beginning of the fixed_heap - * @note the heap is not fully sorted - */ - iterator begin(); - - /** - * @return an iterator to the end of the fixed_heap - * @note the heap is not fully sorted + * Clears the heap and returns the top elements + * @return the top elements in sorted order */ - iterator end(); + std::vector extract_top(); /** * @return a const_iterator to the beginning of the fixed_heap diff --git a/include/util/fixed_heap.tcc b/include/util/fixed_heap.tcc index fdc923d1f..4010484e2 100644 --- a/include/util/fixed_heap.tcc +++ b/include/util/fixed_heap.tcc @@ -3,6 +3,7 @@ * @author Sean Massung */ +#include #include namespace meta @@ -54,30 +55,14 @@ auto fixed_heap::max_elems() const -> size_type } template -std::vector fixed_heap::reverse_and_clear() +std::vector fixed_heap::extract_top() { - std::vector sorted; - sorted.reserve(size()); - while (!pq_.empty()) - { - sorted.emplace_back(std::move(pq_.front())); - std::pop_heap(pq_.begin(), pq_.end(), comp_); - pq_.pop_back(); - } - std::reverse(sorted.begin(), sorted.end()); - return sorted; -} + for (size_type i = 0; i < pq_.size(); ++i) + std::pop_heap(pq_.begin(), pq_.end() - i, comp_); -template -typename fixed_heap::iterator fixed_heap::begin() -{ - return pq_.begin(); -} - -template -typename fixed_heap::iterator fixed_heap::end() -{ - return pq_.end(); + std::vector result = std::move(pq_); + assert(pq_.empty()); + return result; } template diff --git a/src/index/ranker/ranker.cpp b/src/index/ranker/ranker.cpp index 1ee924e43..85b8a0622 100644 --- a/src/index/ranker/ranker.cpp +++ b/src/index/ranker/ranker.cpp @@ -86,7 +86,7 @@ std::vector ranker::rank(detail::ranker_context& ctx, next_doc = doc_id{ctx.idx.num_docs()}; } - return results.reverse_and_clear(); + return results.extract_top(); } float ranker::initial_score(const score_data&) const diff --git a/src/lm/diff.cpp b/src/lm/diff.cpp index 3eb847692..ba19d1c2d 100644 --- a/src/lm/diff.cpp +++ b/src/lm/diff.cpp @@ -61,7 +61,7 @@ std::vector> seen_.clear(); add(candidates, sent); step(sent, candidates, 0); - return candidates.reverse_and_clear(); + return candidates.extract_top(); } template diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index f99a2fbcc..f98458cf1 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -129,7 +129,7 @@ std::vector> 
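(The call site touched by the next hunk is the language model's next-word query; a hedged usage sketch, where the config path and the cpptoml/sentence calls are assumptions about this era of the API rather than part of the patch:)

~~~cpp
#include "cpptoml.h"
#include "lm/language_model.h"
#include <iostream>

int main()
{
    auto config = cpptoml::parse_file("config.toml"); // assumed path
    meta::lm::language_model model{*config};          // trigram model assumed

    meta::lm::sentence prev{"<s> i"}; // n - 1 = 2 tokens of history
    for (const auto& next : model.top_k(prev, 5))
        std::cout << next.first << "\t" << next.second << std::endl;
}
~~~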
candidates.emplace(word, log_prob(candidate)); } - return candidates.reverse_and_clear(); + return candidates.extract_top(); } void language_model::load_vocab() diff --git a/src/tools/top_k.cpp b/src/tools/top_k.cpp index fc19e0589..8a7f3ee28 100644 --- a/src/tools/top_k.cpp +++ b/src/tools/top_k.cpp @@ -64,7 +64,7 @@ int main(int argc, char* argv[]) for (auto& term : counts) terms.emplace(term); - auto sorted = terms.reverse_and_clear(); + auto sorted = terms.extract_top(); for (const auto& it : sorted) std::cout << it.first << "\t" << it.second << std::endl; } From 4b458e00b5ab44e4162f5bf6dbdcb4af2d2fd3cf Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sat, 12 Sep 2015 14:41:36 -0500 Subject: [PATCH 279/481] Switch to fixed_heap in remaining push/pop_heap() locations. --- include/classify/models/linear_model.tcc | 20 ++---- src/parser/sr_parser.cpp | 91 +++++++++++------------- src/topics/tools/lda-topics.cpp | 17 ++--- 3 files changed, 54 insertions(+), 74 deletions(-) diff --git a/include/classify/models/linear_model.tcc b/include/classify/models/linear_model.tcc index 0ac2d25f1..2fba23829 100644 --- a/include/classify/models/linear_model.tcc +++ b/include/classify/models/linear_model.tcc @@ -6,11 +6,13 @@ * consult the file LICENSE in the root of the project. */ +#include #include #include "classify/models/linear_model.h" #include "io/binary.h" #include "logging/logger.h" +#include "util/fixed_heap.h" namespace meta { @@ -156,27 +158,15 @@ auto linear_model::best_classes( return lhs.second > rhs.second; }; - std::vector result; + util::fixed_heap heap{num, comp}; for (const auto& score : class_scores) { auto cid = score.first; if (filter(cid)) - result.push_back(score); + heap.emplace(score); } - std::make_heap(result.begin(), result.end(), comp); - while (result.size() > num) - { - std::pop_heap(result.begin(), result.end(), comp); - result.pop_back(); - } - - std::sort(result.begin(), result.end(), - [](const scored_class& lhs, const scored_class& rhs) - { - return lhs.second < rhs.second; - }); - return result; + return heap.extract_top(); } template diff --git a/src/parser/sr_parser.cpp b/src/parser/sr_parser.cpp index b8865daa6..82683fb30 100644 --- a/src/parser/sr_parser.cpp +++ b/src/parser/sr_parser.cpp @@ -17,6 +17,7 @@ #include "parser/trees/leaf_node.h" #include "parser/trees/visitors/debinarizer.h" #include "util/filesystem.h" +#include "util/fixed_heap.h" #include "util/progress.h" #include "util/range.h" #include "util/time.h" @@ -78,12 +79,14 @@ parse_tree sr_parser::parse(const sequence::sequence& sentence) const return std::get<0>(ss).finalized(); }; - std::vector agenda; - agenda.emplace_back(st, 0); + using fixed_heap = util::fixed_heap; + + fixed_heap agenda{beam_size_, comp}; + agenda.emplace(st, 0); while (!std::all_of(agenda.begin(), agenda.end(), fin)) { - std::vector new_agenda; + fixed_heap new_agenda{beam_size_, comp}; for (const auto& ss : agenda) { @@ -100,16 +103,8 @@ parse_tree sr_parser::parse(const sequence::sequence& sentence) const auto trans = std::get<0>(scored_trans); auto t_score = std::get<1>(scored_trans); - new_agenda.emplace_back(c_state.advance(trans_.at(trans)), - score + t_score); - std::push_heap(new_agenda.begin(), new_agenda.end(), comp); - - if (new_agenda.size() > beam_size_) - { - std::pop_heap(new_agenda.begin(), new_agenda.end(), - comp); - new_agenda.pop_back(); - } + new_agenda.emplace(c_state.advance(trans_.at(trans)), + score + t_score); } } @@ -121,7 +116,7 @@ parse_tree sr_parser::parse(const sequence::sequence& sentence) 
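(The sr_parser hunks below replace the parser's hand-rolled agenda pruning; a standalone sketch of the pattern, with invented stand-in types rather than the parser's real ones:)

~~~cpp
#include "util/fixed_heap.h"
#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// stand-in for (parser state, score)
using scored_state = std::pair<std::string, double>;

std::vector<scored_state> beam_step(const std::vector<scored_state>& agenda,
                                    std::size_t beam_size)
{
    auto comp = [](const scored_state& a, const scored_state& b)
    {
        return a.second > b.second;
    };
    meta::util::fixed_heap<scored_state, decltype(comp)> next{beam_size, comp};
    for (const auto& ss : agenda)
    {
        // expand each state; two fake successors stand in for transitions
        next.emplace(ss.first + " L", ss.second - 0.5);
        next.emplace(ss.first + " R", ss.second - 1.0);
    }
    // at most beam_size best successors, no manual push_heap/pop_heap
    return next.extract_top();
}
~~~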
const auto score = std::get<1>(ss); auto trans = c_state.emergency_transition(); - new_agenda.emplace_back(c_state.advance(trans), score); + new_agenda.emplace(c_state.advance(trans), score); } } @@ -165,7 +160,7 @@ void sr_parser::train(std::vector& trees, training_options options) [&]() { printing::progress progress{" > Iteration " - + std::to_string(iter) + ": ", + + std::to_string(iter) + ": ", trees.size()}; data.shuffle(); @@ -218,16 +213,17 @@ auto sr_parser::train_batch(training_batch batch, parallel::thread_pool& pool, std::atomic num_correct{0}; std::atomic num_incorrect{0}; - parallel::parallel_for(range.begin(), range.end(), pool, [&](size_t i) - { - auto& tree = batch.data.tree(i); - auto& transitions = batch.data.transitions(i); - auto& update = updates[std::this_thread::get_id()]; + parallel::parallel_for( + range.begin(), range.end(), pool, [&](size_t i) + { + auto& tree = batch.data.tree(i); + auto& transitions = batch.data.transitions(i); + auto& update = updates[std::this_thread::get_id()]; - auto res = train_instance(tree, transitions, options, update); - num_correct += res.first; - num_incorrect += res.second; - }); + auto res = train_instance(tree, transitions, options, update); + num_correct += res.first; + num_incorrect += res.second; + }); // Reduce partial results down to final update vector for (const auto& thread_update : updates) @@ -313,12 +309,14 @@ std::pair sr_parser::train_beam_search( return std::get<1>(a) > std::get<1>(b); }; - std::vector agenda; - agenda.emplace_back(state{tree}, 0, true); + using fixed_heap = util::fixed_heap; + + fixed_heap agenda{options.beam_size, score_compare}; + agenda.emplace(state{tree}, 0, true); for (const auto& gold_trans : transitions) { - std::vector new_agenda; + fixed_heap new_agenda{options.beam_size, score_compare}; // keep track if any of the new states is the gold one bool any_gold = false; @@ -366,16 +364,7 @@ std::pair sr_parser::train_beam_search( best_trans = trans; } - new_agenda.emplace_back(new_state, new_score, new_is_gold); - std::push_heap(new_agenda.begin(), new_agenda.end(), - score_compare); - - if (new_agenda.size() > options.beam_size) - { - std::pop_heap(new_agenda.begin(), new_agenda.end(), - score_compare); - new_agenda.pop_back(); - } + new_agenda.emplace(new_state, new_score, new_is_gold); } } @@ -414,24 +403,28 @@ std::pair sr_parser::train_beam_search( return result; } -auto sr_parser::best_transition( - const feature_vector& features, const state& state, - bool check_legality /* = false */) const -> trans_id +auto sr_parser::best_transition(const feature_vector& features, + const state& state, + bool check_legality /* = false */) const + -> trans_id { return model_.best_class(features, [&](trans_id tid) - { - return !check_legality || state.legal(trans_.at(tid)); - }); + { + return !check_legality + || state.legal(trans_.at(tid)); + }); } -auto sr_parser::best_transitions( - const feature_vector& features, const state& state, size_t num, - bool check_legality) const -> std::vector +auto sr_parser::best_transitions(const feature_vector& features, + const state& state, size_t num, + bool check_legality) const + -> std::vector { return model_.best_classes(features, num, [&](trans_id tid) - { - return !check_legality || state.legal(trans_.at(tid)); - }); + { + return !check_legality + || state.legal(trans_.at(tid)); + }); } void sr_parser::save(const std::string& prefix) const diff --git a/src/topics/tools/lda-topics.cpp b/src/topics/tools/lda-topics.cpp index a494ed571..a9305d096 100644 --- 
a/src/topics/tools/lda-topics.cpp +++ b/src/topics/tools/lda-topics.cpp @@ -5,6 +5,7 @@ #include "caching/no_evict_cache.h" #include "index/forward_index.h" +#include "util/fixed_heap.h" using namespace meta; @@ -37,12 +38,15 @@ int print_topics(const std::string& config_file, const std::string& filename, stream >> topic; std::cout << "Topic " << topic << ":" << std::endl; std::cout << "-----------------------" << std::endl; - std::vector> pairs; + auto comp = [](const std::pair& first, const std::pair& second) { return first.second > second.second; }; + util::fixed_heap, decltype(comp)> pairs{ + num_words, comp}; + while (stream) { std::string to_split; @@ -52,16 +56,9 @@ int print_topics(const std::string& config_file, const std::string& filename, size_t idx = to_split.find_first_of(':'); term_id term{std::stoul(to_split.substr(0, idx))}; double prob = std::stod(to_split.substr(idx + 1)); - pairs.emplace_back(term, prob); - std::push_heap(pairs.begin(), pairs.end(), comp); - if (pairs.size() > num_words) - { - std::pop_heap(pairs.begin(), pairs.end(), comp); - pairs.pop_back(); - } + pairs.emplace(term, prob); } - std::sort(pairs.begin(), pairs.end(), comp); - for (const auto& p : pairs) + for (const auto& p : pairs.extract_top()) std::cout << idx->term_text(p.first) << " (" << p.first << "): " << p.second << std::endl; std::cout << std::endl; From 449647f2ec8b55565593f4d4f6a91e84dd4c8a02 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sat, 12 Sep 2015 15:08:41 -0500 Subject: [PATCH 280/481] Fix build with GCC (whose lambdas are noncopyable, apparently). --- src/parser/sr_parser.cpp | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/parser/sr_parser.cpp b/src/parser/sr_parser.cpp index 82683fb30..8a3a5b3d7 100644 --- a/src/parser/sr_parser.cpp +++ b/src/parser/sr_parser.cpp @@ -69,24 +69,29 @@ parse_tree sr_parser::parse(const sequence::sequence& sentence) const else { using scored_state = std::pair; - auto comp = [&](const scored_state& lhs, const scored_state& rhs) + + struct comparator { - return std::get<1>(lhs) > std::get<1>(rhs); + bool operator()(const scored_state& lhs, + const scored_state& rhs) const + { + return std::get<1>(lhs) > std::get<1>(rhs); + }; }; - auto fin = [&](const scored_state& ss) + auto fin = [](const scored_state& ss) { return std::get<0>(ss).finalized(); }; - using fixed_heap = util::fixed_heap; + using fixed_heap = util::fixed_heap; - fixed_heap agenda{beam_size_, comp}; + fixed_heap agenda{beam_size_, comparator{}}; agenda.emplace(st, 0); while (!std::all_of(agenda.begin(), agenda.end(), fin)) { - fixed_heap new_agenda{beam_size_, comp}; + fixed_heap new_agenda{beam_size_, comparator{}}; for (const auto& ss : agenda) { @@ -127,7 +132,8 @@ parse_tree sr_parser::parse(const sequence::sequence& sentence) const } // min because comp is backwards - auto best = std::min_element(agenda.begin(), agenda.end(), comp); + auto best + = std::min_element(agenda.begin(), agenda.end(), comparator{}); parse_tree tree{std::get<0>(*best).stack_item(0)->clone()}; debinarizer debin; @@ -304,19 +310,22 @@ std::pair sr_parser::train_beam_search( // get<1>() is the score // get<2>() is whether or not it is the same as the gold state - auto score_compare = [](const scored_state& a, const scored_state& b) + struct score_compare { - return std::get<1>(a) > std::get<1>(b); + bool operator()(const scored_state& a, const scored_state& b) const + { + return std::get<1>(a) > std::get<1>(b); + } }; - using fixed_heap = 
util::fixed_heap; + using fixed_heap = util::fixed_heap; - fixed_heap agenda{options.beam_size, score_compare}; + fixed_heap agenda{options.beam_size, score_compare{}}; agenda.emplace(state{tree}, 0, true); for (const auto& gold_trans : transitions) { - fixed_heap new_agenda{options.beam_size, score_compare}; + fixed_heap new_agenda{options.beam_size, score_compare{}}; // keep track if any of the new states is the gold one bool any_gold = false; From 7b663fa92f9822ddaa2aad1feeac423d90a73c7f Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 12 Sep 2015 15:56:27 -0500 Subject: [PATCH 281/481] use vectors of term_ids instead of strings in language_model calculations --- include/lm/language_model.h | 16 +++-- include/lm/static_probe_map.h | 9 +-- include/lm/token_list.h | 118 ++++++++++++++++++++++++++++++++++ src/lm/CMakeLists.txt | 1 + src/lm/language_model.cpp | 47 +++++++++----- src/lm/static_probe_map.cpp | 8 +-- src/lm/token_list.cpp | 80 +++++++++++++++++++++++ 7 files changed, 251 insertions(+), 28 deletions(-) create mode 100644 include/lm/token_list.h create mode 100644 src/lm/token_list.cpp diff --git a/include/lm/language_model.h b/include/lm/language_model.h index 2b954adc8..896c2008f 100644 --- a/include/lm/language_model.h +++ b/include/lm/language_model.h @@ -17,6 +17,7 @@ #include "cpptoml.h" #include "lm/sentence.h" #include "lm/static_probe_map.h" +#include "lm/token_list.h" namespace meta { @@ -77,7 +78,7 @@ class language_model * @param tokens A sequence of n tokens (one sentence) * @return the log probability of the likelihood of this sentence */ - float log_prob(sentence tokens) const; + float log_prob(const sentence& tokens) const; /** * @param prev Seen tokens to base the next token off of @@ -100,10 +101,15 @@ class language_model * @param tokens * @return the log probability of one ngram */ - float prob_calc(sentence tokens) const; + float prob_calc(token_list tokens) const; /** - * Loads unigram vocabulary from text file to allow top_k to work. 
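(The idea of this patch in a standalone sketch, with invented names rather than the library's own: n-grams become small vectors of integer ids, so equality checks and probe-map hashing touch a few fixed-size words instead of re-concatenated strings:)

~~~cpp
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

using term_id = std::uint64_t;

std::vector<term_id>
to_ids(const std::vector<std::string>& tokens,
       const std::unordered_map<std::string, term_id>& vocab, term_id unk)
{
    std::vector<term_id> ids;
    ids.reserve(tokens.size());
    for (const auto& tok : tokens)
    {
        auto it = vocab.find(tok);
        ids.push_back(it != vocab.end() ? it->second : unk);
    }
    return ids;
}
~~~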
+ * Internal log_prob that takes a token_list + */ + float log_prob(const token_list& tokens) const; + + /** + * Loads unigram vocabulary from text file */ void load_vocab(); @@ -111,9 +117,11 @@ class language_model std::vector lm_; - std::vector vocabulary_; + std::unordered_map vocabulary_; std::string prefix_; + + float unk_prob_; }; class language_model_exception : public std::runtime_error diff --git a/include/lm/static_probe_map.h b/include/lm/static_probe_map.h index 2a812b264..96b20db40 100644 --- a/include/lm/static_probe_map.h +++ b/include/lm/static_probe_map.h @@ -12,6 +12,7 @@ #include #include "lm/lm_node.h" +#include "lm/token_list.h" #include "util/disk_vector.h" #include "util/optional.h" @@ -51,7 +52,7 @@ class static_probe_map * @return an optional language model node containing the probability and * backoff value for the key */ - util::optional find(const std::string& key) const; + util::optional find(const token_list& key) const; /** * @param key The string key to insert (though only a uint64_t hash is @@ -59,13 +60,13 @@ class static_probe_map * @param prob The probability of the key in this LM * @param backoff The backoff probability for this LM */ - void insert(const std::string& key, float prob, float backoff); + void insert(const token_list& key, float prob, float backoff); private: /** - * Helper function to create hasher and hash str + * Helper function to create hasher and hash token list */ - uint64_t hash(const std::string& str) const; + uint64_t hash(const token_list& tokens) const; /// A seed for the string hash function static constexpr uint64_t seed_ = 0x2bedf99b3aa222d9; diff --git a/include/lm/token_list.h b/include/lm/token_list.h new file mode 100644 index 000000000..f0442026d --- /dev/null +++ b/include/lm/token_list.h @@ -0,0 +1,118 @@ +/** + * @file token_list.h + * @author Sean Massung + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_LM_TOKEN_LIST_H_ +#define META_LM_TOKEN_LIST_H_ + +#include +#include +#include "meta.h" +#include "lm/sentence.h" +#include "util/hash.h" + +namespace meta +{ +namespace lm +{ +class token_list +{ + public: + /** + * Constructor that takes a std::string, splits it, and assigns ids to each + * token based on vocab + * @param ngram + * @param vocab + */ + token_list(const std::string& ngram, + const std::unordered_map& vocab); + + /** + * Constructor that takes an lm::sentence and assigns ids to each one based + * on vocab + * @param ngram + * @param vocab + */ + token_list(const lm::sentence& ngram, + const std::unordered_map& vocab); + + /** + * Constructor that creates a token list with a single element + * @param val + */ + token_list(term_id val); + + /** + * Default constructor. 
+ */ + token_list() = default; + + /** + * @param idx + * @return the token id of the element at location idx + */ + const term_id& operator[](uint64_t idx) const; + + /** + * @param idx + * @return the token id of the element at location idx + */ + term_id& operator[](uint64_t idx); + + /** + * @return the number of tokens in this list + */ + uint64_t size() const; + + /** + * Add elem to the end of the list + * @param elem + */ + void push_back(term_id elem); + + /** + * Remove the first token + */ + void pop_front(); + + /** + * Remove the last token + */ + void pop_back(); + + /** + * @return the underlying container of term_id tokens + */ + const std::vector tokens() const; + + private: + std::vector tokens_; +}; + +inline bool operator==(const token_list& lhs, const token_list& rhs) +{ + return lhs.tokens() == rhs.tokens(); +} + +inline bool operator!=(const token_list& lhs, const token_list& rhs) +{ + return !(lhs == rhs); +} + +template +void hash_append(HashAlgorithm& h, const token_list& list) +{ + using util::hash_append; + for (const auto& val : list.tokens()) + hash_append(h, val); + hash_append(h, list.size()); +} +} +} + +#endif diff --git a/src/lm/CMakeLists.txt b/src/lm/CMakeLists.txt index 464d2f879..3cf4bff20 100644 --- a/src/lm/CMakeLists.txt +++ b/src/lm/CMakeLists.txt @@ -4,6 +4,7 @@ add_subdirectory(tools) add_subdirectory(analyzers) add_library(meta-language-model language_model.cpp + token_list.cpp diff.cpp static_probe_map.cpp sentence.cpp) diff --git a/src/lm/language_model.cpp b/src/lm/language_model.cpp index f99a2fbcc..c2f94e2d8 100644 --- a/src/lm/language_model.cpp +++ b/src/lm/language_model.cpp @@ -34,13 +34,14 @@ language_model::language_model(const cpptoml::table& config) auto time = common::time( [&]() { + prefix_ = *binary_file; + load_vocab(); while (filesystem::file_exists(*binary_file + std::to_string(N_) + ".binlm")) lm_.emplace_back(*binary_file + std::to_string(N_++) + ".binlm"); }); LOG(info) << "Done. 
(" << time.count() << "ms)" << ENDLG; - prefix_ = *binary_file; } else if (arpa_file && binary_file) { @@ -57,7 +58,8 @@ language_model::language_model(const cpptoml::table& config) throw language_model_exception{ "arpa-file or binary-file-prefix needed in config file"}; - load_vocab(); + // cache this value + unk_prob_ = lm_[0].find(token_list{"", vocabulary_})->prob; } void language_model::read_arpa_format(const std::string& arpa_file) @@ -81,6 +83,7 @@ void language_model::read_arpa_format(const std::string& arpa_file) lm_.emplace_back(prefix_ + std::to_string(N_) + ".binlm", count[N_]); std::ofstream unigrams{prefix_ + "0.strings"}; + term_id unigram_id{0}; while (std::getline(infile, buffer)) { // if blank or end @@ -103,10 +106,14 @@ void language_model::read_arpa_format(const std::string& arpa_file) float backoff = 0.0; if (second_tab != std::string::npos) backoff = std::stof(buffer.substr(second_tab + 1)); - lm_[N_].insert(ngram, prob, backoff); if (N_ == 0) + { unigrams << ngram << std::endl; + vocabulary_.emplace(ngram, unigram_id++); + } + + lm_[N_].insert(token_list{ngram, vocabulary_}, prob, backoff); } ++N_; @@ -123,10 +130,12 @@ std::vector> }; util::fixed_heap candidates{k, comp}; + token_list candidate{prev, vocabulary_}; + candidate.push_back(0_tid); for (const auto& word : vocabulary_) { - auto candidate = sentence{prev.to_string() + " " + word}; - candidates.emplace(word, log_prob(candidate)); + candidate[candidate.size() - 1] = word.second; + candidates.emplace(word.first, log_prob(candidate)); } return candidates.reverse_and_clear(); @@ -136,56 +145,62 @@ void language_model::load_vocab() { std::string word; std::ifstream unigrams{prefix_ + "0.strings"}; + term_id cur{0}; while (std::getline(unigrams, word)) { if (word.empty()) continue; - vocabulary_.push_back(word); + vocabulary_.emplace(word, cur++); } } -float language_model::prob_calc(sentence tokens) const +float language_model::prob_calc(token_list tokens) const { if (tokens.size() == 0) throw language_model_exception{"prob_calc: tokens is empty!"}; if (tokens.size() == 1) { - auto opt = lm_[0].find(tokens[0]); + auto opt = lm_[0].find(token_list{tokens[0]}); if (opt) return opt->prob; - return lm_[0].find("")->prob; + return unk_prob_; } else { - auto opt = lm_[tokens.size() - 1].find(tokens.to_string()); + auto opt = lm_[tokens.size() - 1].find(tokens); if (opt) return opt->prob; - auto hist = tokens(0, tokens.size() - 1); + auto hist = tokens; + hist.pop_back(); tokens.pop_front(); if (tokens.size() == 1) { - hist = hist(0, 1); auto opt = lm_[0].find(hist[0]); if (!opt) - hist.substitute(0, ""); + hist[0] = vocabulary_.at(""); } - opt = lm_[hist.size() - 1].find(hist.to_string()); + opt = lm_[hist.size() - 1].find(hist); if (opt) return opt->backoff + prob_calc(tokens); return prob_calc(tokens); } } -float language_model::log_prob(sentence tokens) const +float language_model::log_prob(const sentence& tokens) const +{ + return log_prob(token_list{tokens, vocabulary_}); +} + +float language_model::log_prob(const token_list& tokens) const { float prob = 0.0f; // tokens < N - sentence ngram; + token_list ngram; for (uint64_t i = 0; i < N_ - 1 && i < tokens.size(); ++i) { ngram.push_back(tokens[i]); diff --git a/src/lm/static_probe_map.cpp b/src/lm/static_probe_map.cpp index ddd536f1e..f928db475 100644 --- a/src/lm/static_probe_map.cpp +++ b/src/lm/static_probe_map.cpp @@ -17,7 +17,7 @@ static_probe_map::static_probe_map(const std::string& filename, { } -void static_probe_map::insert(const std::string& key, float 
prob, float backoff) +void static_probe_map::insert(const token_list& key, float prob, float backoff) { auto hashed = hash(key); auto idx = (hashed % (table_.size() / 2)) * 2; @@ -39,7 +39,7 @@ void static_probe_map::insert(const std::string& key, float prob, float backoff) } } -util::optional static_probe_map::find(const std::string& key) const +util::optional static_probe_map::find(const token_list& key) const { auto hashed = hash(key); auto idx = (hashed % (table_.size() / 2)) * 2; @@ -56,10 +56,10 @@ util::optional static_probe_map::find(const std::string& key) const } } -uint64_t static_probe_map::hash(const std::string& str) const +uint64_t static_probe_map::hash(const token_list& tokens) const { util::murmur_hash<> hasher{seed_}; - hasher(str.data(), str.length()); + hasher(tokens.tokens().data(), tokens.size() * sizeof(term_id)); return static_cast(hasher); } } diff --git a/src/lm/token_list.cpp b/src/lm/token_list.cpp new file mode 100644 index 000000000..42fe12cef --- /dev/null +++ b/src/lm/token_list.cpp @@ -0,0 +1,80 @@ +/** + * @file token_list.cpp + * @author Sean Massung + */ + +#include "lm/token_list.h" + +namespace meta +{ +namespace lm +{ +token_list::token_list(const std::string& ngram, + const std::unordered_map& vocab) +{ + std::string token; + std::istringstream iss{ngram}; + while (iss >> token) + { + auto it = vocab.find(token); + if (it != vocab.end()) + tokens_.push_back(it->second); + else + tokens_.push_back(vocab.at("")); + } +} + +token_list::token_list(const lm::sentence& ngram, + const std::unordered_map& vocab) +{ + for (const auto& token : ngram) + { + auto it = vocab.find(token); + if (it != vocab.end()) + tokens_.push_back(it->second); + else + tokens_.push_back(vocab.at("")); + } +} + +token_list::token_list(term_id val) +{ + tokens_.push_back(val); +} + +const term_id& token_list::operator[](uint64_t idx) const +{ + return tokens_[idx]; +} + +term_id& token_list::operator[](uint64_t idx) +{ + return tokens_[idx]; +} + +uint64_t token_list::size() const +{ + return tokens_.size(); +} + +void token_list::push_back(term_id elem) +{ + tokens_.push_back(elem); +} + +void token_list::pop_front() +{ + tokens_.erase(tokens_.begin()); +} + +void token_list::pop_back() +{ + tokens_.pop_back(); +} + +const std::vector token_list::tokens() const +{ + return tokens_; +} +} +} From 228460e36de2291ac5b32e3fe9144cbdf0df4f05 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sat, 12 Sep 2015 16:26:25 -0500 Subject: [PATCH 282/481] update hashing syntax for token_list and static_probe_map --- include/lm/token_list.h | 4 +--- src/lm/static_probe_map.cpp | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/lm/token_list.h b/include/lm/token_list.h index f0442026d..529edab95 100644 --- a/include/lm/token_list.h +++ b/include/lm/token_list.h @@ -107,9 +107,7 @@ inline bool operator!=(const token_list& lhs, const token_list& rhs) template void hash_append(HashAlgorithm& h, const token_list& list) { - using util::hash_append; - for (const auto& val : list.tokens()) - hash_append(h, val); + h(list.tokens().data(), sizeof(term_id) * list.size()); hash_append(h, list.size()); } } diff --git a/src/lm/static_probe_map.cpp b/src/lm/static_probe_map.cpp index f928db475..96e896d88 100644 --- a/src/lm/static_probe_map.cpp +++ b/src/lm/static_probe_map.cpp @@ -59,7 +59,7 @@ util::optional static_probe_map::find(const token_list& key) const uint64_t static_probe_map::hash(const token_list& tokens) const { util::murmur_hash<> hasher{seed_}; - 
hasher(tokens.tokens().data(), tokens.size() * sizeof(term_id)); + hash_append(hasher, tokens); return static_cast(hasher); } } From 510cdd473ffffe11e8abd3054c0c7da3c81da2ae Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sat, 12 Sep 2015 16:38:52 -0500 Subject: [PATCH 283/481] Switch remaining uses of io::read/write_binary to io::packed. Model files will need regenerated. --- include/classify/models/linear_model.tcc | 22 +++--- include/io/packed.h | 91 ++++++++++++++++++------ include/stats/dirichlet.tcc | 1 - src/classify/classifier/naive_bayes.cpp | 5 +- src/index/metadata_writer.cpp | 21 +++--- src/parser/sr_parser.cpp | 6 +- src/parser/transition_map.cpp | 17 +++-- src/sequence/sequence_analyzer.cpp | 15 ++-- 8 files changed, 113 insertions(+), 65 deletions(-) diff --git a/include/classify/models/linear_model.tcc b/include/classify/models/linear_model.tcc index 2fba23829..be7ec9711 100644 --- a/include/classify/models/linear_model.tcc +++ b/include/classify/models/linear_model.tcc @@ -10,7 +10,7 @@ #include #include "classify/models/linear_model.h" -#include "io/binary.h" +#include "io/packed.h" #include "logging/logger.h" #include "util/fixed_heap.h" @@ -26,7 +26,7 @@ void linear_model::load(std::istream& model) throw exception{"model not found"}; uint64_t num_feats; - io::read_binary(model, num_feats); + io::packed::read(model, num_feats); for (uint64_t i = 0; i < num_feats; ++i) { @@ -34,10 +34,10 @@ void linear_model::load(std::istream& model) throw exception{"malformed model file (too few features written)"}; feature_id feature_name; - io::read_binary(model, feature_name); + io::packed::read(model, feature_name); uint64_t num_cids; - io::read_binary(model, num_cids); + io::packed::read(model, num_cids); for (uint64_t j = 0; j < num_cids; ++j) { @@ -47,8 +47,8 @@ void linear_model::load(std::istream& model) class_id cid; feature_value val; - io::read_binary(model, cid); - io::read_binary(model, val); + io::packed::read(model, cid); + io::packed::read(model, val); weights_[feature_name][cid] = val; } @@ -60,20 +60,20 @@ void linear_model::save( std::ostream& model) const { uint64_t sze = weights_.size(); - io::write_binary(model, sze); + io::packed::write(model, sze); for (const auto& feat_vec : weights_) { const auto& feat = feat_vec.first; const auto& weights = feat_vec.second; - io::write_binary(model, feat); + io::packed::write(model, feat); uint64_t size = weights.size(); - io::write_binary(model, size); + io::packed::write(model, size); for (const auto& weight : weights) { - io::write_binary(model, weight.first); - io::write_binary(model, weight.second); + io::packed::write(model, weight.first); + io::packed::write(model, weight.second); } } } diff --git a/include/io/packed.h b/include/io/packed.h index a2089d476..507f60eee 100644 --- a/include/io/packed.h +++ b/include/io/packed.h @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include "util/string_view.h" namespace meta { @@ -31,8 +32,11 @@ namespace packed * @param value The value to write * @return the number of bytes used to write out the value */ -template -uint64_t write(OutputStream& stream, uint64_t value) +template +typename std::enable_if::value + && std::is_unsigned::value, + uint64_t>::type + write(OutputStream& stream, T value) { uint64_t size = 0; while (value > 127) @@ -56,8 +60,11 @@ uint64_t write(OutputStream& stream, uint64_t value) * @param value The value to write * @return the number of bytes used to write out the value */ -template -uint64_t write(OutputStream& stream, int64_t 
value)
+template <class OutputStream, class T>
+typename std::enable_if<std::is_integral<T>::value
+                            && std::is_signed<T>::value,
+                        uint64_t>::type
+    write(OutputStream& stream, T value)
 {
     uint64_t elem = (value << 1) ^ (value >> 63);
     return write(stream, elem);
 }
@@ -69,20 +76,22 @@ uint64_t write(OutputStream& stream, int64_t value)
  * mantissa * std::pow(2.0, exponent) == elem. The mantissa and exponent
  * are integers and are written using the integer packed format.
  *
- * @see http://stackoverflow.com/questions/5672960/how-can-i-extract-the-mantissa-of-a-double
+ * @see
+ *http://stackoverflow.com/questions/5672960/how-can-i-extract-the-mantissa-of-a-double
  * @see http://dlib.net/dlib/float_details.h.html
  *
  * @param stream The stream to write to
  * @param value The value to write
  * @return the number of bytes used to write out the value
  */
-template <class OutputStream>
-uint64_t write(OutputStream& stream, double value)
+template <class OutputStream, class T>
+typename std::enable_if<std::is_floating_point<T>::value, uint64_t>::type
+    write(OutputStream& stream, T value)
 {
     int exp;
-    auto digits = std::numeric_limits<double>::digits;
-    auto mantissa
-        = static_cast<int64_t>(std::frexp(value, &exp) * (uint64_t{1} << digits));
+    auto digits = std::numeric_limits<T>::digits;
+    auto mantissa = static_cast<int64_t>(std::frexp(value, &exp)
+                                         * (uint64_t{1} << digits));
     int64_t exponent = exp - digits;

     // see dlib link above; tries to shrink mantissa for more efficient
@@ -108,7 +117,7 @@ uint64_t write(OutputStream& stream, double value)
  * @return the number of bytes used to write out the value
  */
 template <class OutputStream>
-uint64_t write(OutputStream& stream, const std::string& value)
+uint64_t write(OutputStream& stream, util::string_view value)
 {
     for (const auto& c : value)
     {
@@ -118,6 +127,23 @@ uint64_t write(OutputStream& stream, const std::string& value)
     return value.size() + 1;
 }

+/**
+ * Writes an enumeration type in a packed representation. This determines
+ * the underlying type and serializes that.
+ *
+ * @param stream The stream to write to
+ * @param value The value to write
+ * @return the number of bytes used to write out the value
+ */
+template <class OutputStream, class T>
+typename std::enable_if<std::is_enum<T>::value, uint64_t>::type
+    write(OutputStream& stream, T value)
+{
+    auto val = static_cast<typename std::underlying_type<T>::type>(value);
+    return write(stream, val);
+}
+
+
 /**
  * Reads an unsigned integer from its packed representation.
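 *
 * (A worked example, not from the patch: values are stored 7 bits per
 * byte, low-order group first, with the high bit of each byte used as a
 * continuation flag. The value 300 = 0b100101100 is therefore the two
 * bytes 0xAC = 0b1'0101100 and 0x02 = 0b0'0000010, and reading reverses
 * the process: 44 | (2 << 7) == 300.)
 *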
* @@ -125,8 +151,11 @@ uint64_t write(OutputStream& stream, const std::string& value) * @param value The element to write into * @return the number of bytes read */ -template -uint64_t read(InputStream& stream, uint64_t& value) +template +typename std::enable_if::value + && std::is_unsigned::value, + uint64_t>::type + read(InputStream& stream, T& value) { value = 0; uint64_t size = 0; @@ -134,7 +163,7 @@ uint64_t read(InputStream& stream, uint64_t& value) do { byte = stream.get(); - value |= static_cast(byte & 127) << (7 * size); + value |= static_cast(byte & 127) << (7 * size); ++size; } while (byte & 128); return size; @@ -151,10 +180,13 @@ uint64_t read(InputStream& stream, uint64_t& value) * @param value The element to write into * @return the number of bytes read */ -template -uint64_t read(InputStream& stream, int64_t& value) +template +typename std::enable_if::value + && std::is_signed::value, + uint64_t>::type + read(InputStream& stream, T& value) { - uint64_t elem; + typename std::make_unsigned::type elem; auto bytes = read(stream, elem); value = (elem >> 1) ^ (-(elem & 1)); @@ -169,8 +201,9 @@ uint64_t read(InputStream& stream, int64_t& value) * @param value The element to write into * @return the number of bytes read */ -template -uint64_t read(InputStream& stream, double& value) +template +typename std::enable_if::value, uint64_t>::type + read(InputStream& stream, T& value) { int64_t mantissa; int64_t exponent; @@ -196,6 +229,24 @@ uint64_t read(InputStream& stream, std::string& value) value += c; return value.size() + 1; } + +/** + * Reads an enum from its packed representation. This reads an integer of + * the underlying type and then sets the enum accordingly. + * + * @param stream The stream to read from + * @param value The element to write to + * @return the number of bytes read + */ +template +typename std::enable_if::value, uint64_t>::type + read(InputStream& stream, T& value) +{ + typename std::underlying_type::type val; + auto size = read(stream, val); + value = static_cast(val); + return size; +} } } } diff --git a/include/stats/dirichlet.tcc b/include/stats/dirichlet.tcc index 78c334c56..1f71f8233 100644 --- a/include/stats/dirichlet.tcc +++ b/include/stats/dirichlet.tcc @@ -6,7 +6,6 @@ #include "stats/dirichlet.h" #include "util/identifiers.h" #include "util/shim.h" -#include "io/binary.h" #include "io/packed.h" namespace meta diff --git a/src/classify/classifier/naive_bayes.cpp b/src/classify/classifier/naive_bayes.cpp index 6e0188c7a..9c66fded9 100644 --- a/src/classify/classifier/naive_bayes.cpp +++ b/src/classify/classifier/naive_bayes.cpp @@ -11,7 +11,6 @@ #if META_HAS_ZLIB #include "io/gzstream.h" #endif -#include "io/binary.h" #include "io/packed.h" namespace meta @@ -104,7 +103,7 @@ void naive_bayes::save(const std::string& prefix) const { const auto& label = dist.first; const auto& probs = dist.second; - io::write_binary(tp_out, static_cast(label)); + io::packed::write(tp_out, static_cast(label)); probs.save(tp_out); } class_probs_.save(cp_out); @@ -139,7 +138,7 @@ void naive_bayes::load(const std::string& prefix) for (uint64_t i = 0; i < size; ++i) { std::string label; - io::read_binary(tp_in, label); + io::packed::read(tp_in, label); term_probs_[class_label{label}].load(tp_in); } class_probs_.load(cp_in); diff --git a/src/index/metadata_writer.cpp b/src/index/metadata_writer.cpp index dd53893af..423954062 100644 --- a/src/index/metadata_writer.cpp +++ b/src/index/metadata_writer.cpp @@ -4,7 +4,6 @@ */ #include "index/metadata_writer.h" -#include 
"io/binary.h" #include "io/packed.h" namespace meta @@ -23,17 +22,19 @@ metadata_writer::metadata_writer(const std::string& prefix, uint64_t num_docs, // cast below is needed for OS X overload resolution byte_pos_ += io::packed::write(db_file_, static_cast(schema_.size() + 2)); - byte_pos_ += io::write_binary(db_file_, std::string{"length"}); - byte_pos_ += io::write_binary(db_file_, - corpus::metadata::field_type::UNSIGNED_INT); - byte_pos_ += io::write_binary(db_file_, std::string{"unique-terms"}); - byte_pos_ += io::write_binary(db_file_, - corpus::metadata::field_type::UNSIGNED_INT); + + byte_pos_ += io::packed::write(db_file_, std::string{"length"}); + byte_pos_ += io::packed::write(db_file_, + corpus::metadata::field_type::UNSIGNED_INT); + + byte_pos_ += io::packed::write(db_file_, "unique-terms"); + byte_pos_ += io::packed::write(db_file_, + corpus::metadata::field_type::UNSIGNED_INT); for (const auto& finfo : schema_) { - byte_pos_ += io::write_binary(db_file_, finfo.name); - byte_pos_ += io::write_binary(db_file_, finfo.type); + byte_pos_ += io::packed::write(db_file_, finfo.name); + byte_pos_ += io::packed::write(db_file_, finfo.type); } } @@ -69,7 +70,7 @@ void metadata_writer::write(doc_id d_id, uint64_t length, uint64_t num_unique, break; case corpus::metadata::field_type::STRING: - byte_pos_ += io::write_binary(db_file_, fld.str); + byte_pos_ += io::packed::write(db_file_, fld.str); break; } } diff --git a/src/parser/sr_parser.cpp b/src/parser/sr_parser.cpp index 8a3a5b3d7..6c8f23ab4 100644 --- a/src/parser/sr_parser.cpp +++ b/src/parser/sr_parser.cpp @@ -6,7 +6,7 @@ #include #include -#include "io/binary.h" +#include "io/packed.h" #include "logging/logger.h" #include "parallel/parallel_for.h" #include "parser/sr_parser.h" @@ -446,7 +446,7 @@ void sr_parser::save(const std::string& prefix) const std::ofstream model{prefix + "/parser.model", std::ios::binary}; #endif - io::write_binary(model, beam_size_); + io::packed::write(model, beam_size_); model_.save(model); } @@ -470,7 +470,7 @@ void sr_parser::load(std::istream& model) if (!model) throw sr_parser_exception{"model file not found"}; - io::read_binary(model, beam_size_); + io::packed::read(model, beam_size_); model_.load(model); } } diff --git a/src/parser/transition_map.cpp b/src/parser/transition_map.cpp index 8f70129e4..30c0a19f5 100644 --- a/src/parser/transition_map.cpp +++ b/src/parser/transition_map.cpp @@ -6,7 +6,7 @@ #include #include -#include "io/binary.h" +#include "io/packed.h" #include "parser/transition_map.h" #include "util/filesystem.h" @@ -39,7 +39,7 @@ void transition_map::load(std::istream& store) throw transition_map_exception{"missing transitions model file"}; uint64_t num_trans; - io::read_binary(store, num_trans); + io::packed::read(store, num_trans); if (!store) throw transition_map_exception{"malformed transitions model file"}; @@ -53,7 +53,7 @@ void transition_map::load(std::istream& store) "transitions written)"}; transition::type_t trans_type; - io::read_binary(store, trans_type); + io::packed::read(store, trans_type); util::optional trans; switch (trans_type) @@ -63,7 +63,7 @@ void transition_map::load(std::istream& store) case transition::type_t::UNARY: { std::string lbl; - io::read_binary(store, lbl); + io::packed::read(store, lbl); trans = transition{trans_type, class_label{lbl}}; break; } @@ -118,18 +118,17 @@ void transition_map::save(const std::string& prefix) const std::ofstream store{prefix + "/parser.trans", std::ios::binary}; #endif - uint64_t sze = transitions_.size(); - 
io::write_binary(store, sze); + io::packed::write(store, transitions_.size()); for (const auto& trans : transitions_) { - io::write_binary(store, trans.type()); + io::packed::write(store, trans.type()); switch (trans.type()) { case transition::type_t::REDUCE_L: case transition::type_t::REDUCE_R: case transition::type_t::UNARY: - io::write_binary(store, - static_cast(trans.label())); + io::packed::write( + store, static_cast(trans.label())); break; default: diff --git a/src/sequence/sequence_analyzer.cpp b/src/sequence/sequence_analyzer.cpp index e93108bf5..1d97abef2 100644 --- a/src/sequence/sequence_analyzer.cpp +++ b/src/sequence/sequence_analyzer.cpp @@ -6,7 +6,7 @@ #include #include #include -#include "io/binary.h" +#include "io/packed.h" #if META_HAS_ZLIB #include "io/gzstream.h" #endif @@ -54,7 +54,7 @@ void sequence_analyzer::load_feature_id_mapping(std::istream& input) throw exception{"missing feature id mapping"}; uint64_t num_keys; - io::read_binary(input, num_keys); + io::packed::read(input, num_keys); printing::progress progress{" > Loading feature mapping: ", num_keys}; num_keys = 0; while (input) @@ -62,8 +62,8 @@ void sequence_analyzer::load_feature_id_mapping(std::istream& input) progress(++num_keys); std::string key; feature_id value; - io::read_binary(input, key); - io::read_binary(input, value); + io::packed::read(input, key); + io::packed::read(input, value); feature_id_mapping_[key] = value; } } @@ -86,14 +86,13 @@ void sequence_analyzer::save(const std::string& prefix) const #else std::ofstream output{prefix + "/feature.mapping", std::ios::binary}; #endif - uint64_t sze = feature_id_mapping_.size(); - io::write_binary(output, sze); + io::packed::write(output, feature_id_mapping_.size()); uint64_t i = 0; for (const auto& pair : feature_id_mapping_) { progress(++i); - io::write_binary(output, pair.first); - io::write_binary(output, pair.second); + io::packed::write(output, pair.first); + io::packed::write(output, pair.second); } map::save_mapping(label_id_mapping_, prefix + "/label.mapping"); } From 46f4f3f8996ef988f3070bd8da43aa9be3736996 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sat, 12 Sep 2015 19:32:06 -0500 Subject: [PATCH 284/481] Fix debug build (io::packed didn't like util::identifiers). --- include/io/packed.h | 4 +- include/util/comparable.h | 12 +++--- include/util/identifiers.h | 57 ++++++++++++++++++++++--- src/classify/classifier/naive_bayes.cpp | 2 +- 4 files changed, 59 insertions(+), 16 deletions(-) diff --git a/include/io/packed.h b/include/io/packed.h index 507f60eee..2b0d608cc 100644 --- a/include/io/packed.h +++ b/include/io/packed.h @@ -66,7 +66,8 @@ typename std::enable_if::value uint64_t>::type write(OutputStream& stream, T value) { - uint64_t elem = (value << 1) ^ (value >> 63); + typename std::make_unsigned::type elem + = (value << 1) ^ (value >> (sizeof(T) * 8 - 1)); return write(stream, elem); } @@ -143,7 +144,6 @@ typename std::enable_if::value, uint64_t>::type return write(stream, val); } - /** * Reads an unsigned integer from its packed representation. 
* diff --git a/include/util/comparable.h b/include/util/comparable.h index cc936e17d..bf5f81150 100644 --- a/include/util/comparable.h +++ b/include/util/comparable.h @@ -34,7 +34,7 @@ class comparable * @param rhs * @return whether lhs == rhs */ - friend bool operator==(const comparable& lhs, const comparable& rhs) + friend constexpr bool operator==(const comparable& lhs, const comparable& rhs) { return !(lhs.as_derived() < rhs.as_derived()) && !(rhs.as_derived() < lhs.as_derived()); @@ -48,7 +48,7 @@ class comparable * @param rhs * @return whether lhs != rhs */ - friend bool operator!=(const comparable& lhs, const comparable& rhs) + friend constexpr bool operator!=(const comparable& lhs, const comparable& rhs) { return !(lhs == rhs); } @@ -58,7 +58,7 @@ class comparable * @param rhs * @return whether lhs > rhs, as defined by their operator<. */ - friend bool operator>(const comparable& lhs, const comparable& rhs) + friend constexpr bool operator>(const comparable& lhs, const comparable& rhs) { return rhs.as_derived() < lhs.as_derived(); } @@ -69,7 +69,7 @@ class comparable * @return whether lhs <= rhs, as defined by their operator< and * comparable::operator==. */ - friend bool operator<=(const comparable& lhs, const comparable& rhs) + friend constexpr bool operator<=(const comparable& lhs, const comparable& rhs) { return lhs.as_derived() < rhs.as_derived() || lhs == rhs; } @@ -80,7 +80,7 @@ class comparable * @return whether lhs >= rhs, as defined by comparable::operator> and * comparable::operator==. */ - friend bool operator>=(const comparable& lhs, const comparable& rhs) + friend constexpr bool operator>=(const comparable& lhs, const comparable& rhs) { return lhs > rhs || lhs == rhs; } @@ -92,7 +92,7 @@ class comparable * * @return the Derived form of the current comparable */ - inline const Derived& as_derived() const + inline constexpr const Derived& as_derived() const { return static_cast(*this); } diff --git a/include/util/identifiers.h b/include/util/identifiers.h index fd058fd6c..cd2122fb0 100644 --- a/include/util/identifiers.h +++ b/include/util/identifiers.h @@ -14,6 +14,7 @@ #include // for std::hash #include "util/comparable.h" +#include "util/string_view.h" namespace meta { @@ -33,8 +34,8 @@ struct numeric template struct is_numeric { - const static constexpr bool value - = std::is_integral::value || std::is_base_of::value; + const static constexpr bool value = std::is_integral::value + || std::is_base_of::value; }; /** @@ -62,6 +63,8 @@ struct hash_wrapper : public Wrapped> template struct identifier : public comparable> { + using underlying_type = T; + /** * The underlying id for the identifier. */ @@ -73,8 +76,9 @@ struct identifier : public comparable> * * @param t the underlying type to convert into an identifier */ - explicit identifier(const T& t) : id_{t} + explicit constexpr identifier(const T& t) : id_{t} { + // nothing } /** @@ -129,7 +133,7 @@ struct identifier : public comparable> * * @return the base type representation for this identifier */ - operator const T&() const + constexpr operator const T&() const { return id_; } @@ -145,6 +149,20 @@ struct identifier : public comparable> return id_; } + /** + * Conversion to string_view. Enabled only if T is a std::string. 
+     * @return the base type representation of this identifier, converted
+     *         to a string_view
+     */
+    template <
+        typename U = T,
+        typename
+        = typename std::enable_if<std::is_same<U, std::string>::value>::type>
+    constexpr operator util::string_view() const
+    {
+        return id_;
+    }
+
     /**
      * identifiers are comparable by their base types. This allows for
      * storage in comparison-based containers like std::map or std::set.
      *
      * @param lhs
      * @param rhs
      * @return whether lhs < rhs based on T::operator<.
      */
-    inline friend bool operator<(const identifier& lhs, const identifier& rhs)
+    inline friend constexpr bool operator<(const identifier& lhs,
+                                           const identifier& rhs)
     {
         return static_cast<T>(lhs) < static_cast<T>(rhs);
     }
@@ -311,12 +330,36 @@ struct hash<meta::util::identifier<Tag, T>>
      */
     template <template <class> class Wrapped, class T>
     size_t operator()(
-        const meta::util::identifier<meta::util::hash_wrapper<Wrapped>,
-                                     T>& to_hash) const
+        const meta::util::identifier<meta::util::hash_wrapper<Wrapped>, T>&
+            to_hash) const
     {
         return hash<T>{}(static_cast<T>(to_hash));
     }
 };
+
+/**
+ * A partial specialization for hash_wrapper types.
+ */
+template
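A minimal sketch of how these strong typedefs are used (the tag and alias names here are invented for illustration):

~~~cpp
#include "util/identifiers.h"
#include <cstdint>
#include <iostream>

struct width_tag
{
};
struct height_tag
{
};
using width_t = meta::util::identifier<width_tag, uint64_t>;
using height_t = meta::util::identifier<height_tag, uint64_t>;

int main()
{
    width_t w{640};
    height_t h{480};
    // width_t bad = h;          // will not compile: the tags differ
    uint64_t raw = w;            // implicit conversion back to the base type
    std::cout << raw << " " << (w < width_t{1024}) << std::endl;
    (void)h;
}
~~~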