From e297606ce30ca0092fe82c2f596b0df359803aad Mon Sep 17 00:00:00 2001 From: Chase Date: Tue, 17 Jan 2017 11:26:54 -0500 Subject: [PATCH 1/4] don't train or untrain empty word hashes --- lib/classifier-reborn/bayes.rb | 8 ++++++-- lib/classifier-reborn/extensions/hasher.rb | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/lib/classifier-reborn/bayes.rb b/lib/classifier-reborn/bayes.rb index 428d0b9..0740e9d 100644 --- a/lib/classifier-reborn/bayes.rb +++ b/lib/classifier-reborn/bayes.rb @@ -73,7 +73,9 @@ def train(category, text) @backend.update_category_training_count(category, 1) @backend.update_total_trainings(1) - Hasher.word_hash(text, @language, @enable_stemmer).each do |word, count| + word_hash = Hasher.word_hash(text, @language, @enable_stemmer) + return if word_hash.length == 0 + word_hash.each do |word, count| @backend.update_category_word_frequency(category, word, count) @backend.update_category_word_count(category, count) @backend.update_total_words(count) @@ -91,7 +93,9 @@ def untrain(category, text) category = CategoryNamer.prepare_name(category) @backend.update_category_training_count(category, -1) @backend.update_total_trainings(-1) - Hasher.word_hash(text, @language, @enable_stemmer).each do |word, count| + word_hash = Hasher.word_hash(text, @language, @enable_stemmer) + return if word_hash.length == 0 + word_hash.each do |word, count| next if @backend.total_words < 0 orig = @backend.category_word_frequency(category, word) || 0 @backend.update_category_word_frequency(category, word, -count) diff --git a/lib/classifier-reborn/extensions/hasher.rb b/lib/classifier-reborn/extensions/hasher.rb index 398a46f..c1bf1de 100644 --- a/lib/classifier-reborn/extensions/hasher.rb +++ b/lib/classifier-reborn/extensions/hasher.rb @@ -21,7 +21,7 @@ def word_hash(str, language = 'en', enable_stemmer = true) # Return a word hash without extra punctuation or short symbols, just stemmed words def clean_word_hash(str, language = 'en', enable_stemmer = true) - word_hash_for_words str.gsub(/[^\p{WORD}\s]/, '').downcase.split, language, enable_stemmer + word_hash_for_words(str.gsub(/[^\p{WORD}\s]/, '').downcase.split, language, enable_stemmer) end def word_hash_for_words(words, language = 'en', enable_stemmer = true) From e66c7122a9590c57d5475ad95879ddfe0cedbb44 Mon Sep 17 00:00:00 2001 From: Chase Date: Tue, 17 Jan 2017 12:10:26 -0500 Subject: [PATCH 2/4] first pass on checking word hash first --- lib/classifier-reborn/bayes.rb | 35 ++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/lib/classifier-reborn/bayes.rb b/lib/classifier-reborn/bayes.rb index 0740e9d..ddc96ad 100644 --- a/lib/classifier-reborn/bayes.rb +++ b/lib/classifier-reborn/bayes.rb @@ -60,6 +60,8 @@ def initialize(*args) # b.train "that", "That text" # b.train "The other", "The other text" def train(category, text) + word_hash = Hasher.word_hash(text, @language, @enable_stemmer) + return if word_hash.empty? category = CategoryNamer.prepare_name(category) # Add the category dynamically or raise an error @@ -73,8 +75,6 @@ def train(category, text) @backend.update_category_training_count(category, 1) @backend.update_total_trainings(1) - word_hash = Hasher.word_hash(text, @language, @enable_stemmer) - return if word_hash.length == 0 word_hash.each do |word, count| @backend.update_category_word_frequency(category, word, count) @backend.update_category_word_count(category, count) @@ -90,11 +90,11 @@ def train(category, text) # b.train :this, "This text" # b.untrain :this, "This text" def untrain(category, text) + word_hash = Hasher.word_hash(text, @language, @enable_stemmer) + return if word_hash.empty? category = CategoryNamer.prepare_name(category) @backend.update_category_training_count(category, -1) @backend.update_total_trainings(-1) - word_hash = Hasher.word_hash(text, @language, @enable_stemmer) - return if word_hash.length == 0 word_hash.each do |word, count| next if @backend.total_words < 0 orig = @backend.category_word_frequency(category, word) || 0 @@ -116,18 +116,25 @@ def untrain(category, text) def classifications(text) score = {} word_hash = Hasher.word_hash(text, @language, @enable_stemmer) - category_keys.each do |category| - score[category.to_s] = 0 - total = (@backend.category_word_count(category) || 1).to_f - word_hash.each do |word, _count| - s = @backend.word_in_category?(category, word) ? @backend.category_word_frequency(category, word) : 0.1 - score[category.to_s] += Math.log(s / total) + if word_hash.empty? + category_keys.each do |category| + score[category.to_s] = Float::INFINITY + end + score + else + category_keys.each do |category| + score[category.to_s] = 0 + total = (@backend.category_word_count(category) || 1).to_f + word_hash.each do |word, _count| + s = @backend.word_in_category?(category, word) ? @backend.category_word_frequency(category, word) : 0.1 + score[category.to_s] += Math.log(s / total) + end + # now add prior probability for the category + s = @backend.category_has_trainings?(category) ? @backend.category_training_count(category) : 0.1 + score[category.to_s] += Math.log(s / @backend.total_trainings.to_f) end - # now add prior probability for the category - s = @backend.category_has_trainings?(category) ? @backend.category_training_count(category) : 0.1 - score[category.to_s] += Math.log(s / @backend.total_trainings.to_f) + score end - score end # Returns the classification of the provided +text+, which is one of the From 5a4eb3b2412f939532a45351804923004215aabf Mon Sep 17 00:00:00 2001 From: Chase Date: Tue, 17 Jan 2017 12:36:38 -0500 Subject: [PATCH 3/4] simplified classifications and fixed test --- lib/classifier-reborn/bayes.rb | 25 ++++++++++++------------- test/bayes/bayesian_common_tests.rb | 14 +++++++------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/lib/classifier-reborn/bayes.rb b/lib/classifier-reborn/bayes.rb index ddc96ad..fd64787 100644 --- a/lib/classifier-reborn/bayes.rb +++ b/lib/classifier-reborn/bayes.rb @@ -120,21 +120,20 @@ def classifications(text) category_keys.each do |category| score[category.to_s] = Float::INFINITY end - score - else - category_keys.each do |category| - score[category.to_s] = 0 - total = (@backend.category_word_count(category) || 1).to_f - word_hash.each do |word, _count| - s = @backend.word_in_category?(category, word) ? @backend.category_word_frequency(category, word) : 0.1 - score[category.to_s] += Math.log(s / total) - end - # now add prior probability for the category - s = @backend.category_has_trainings?(category) ? @backend.category_training_count(category) : 0.1 - score[category.to_s] += Math.log(s / @backend.total_trainings.to_f) + return score + end + category_keys.each do |category| + score[category.to_s] = 0 + total = (@backend.category_word_count(category) || 1).to_f + word_hash.each do |word, _count| + s = @backend.word_in_category?(category, word) ? @backend.category_word_frequency(category, word) : 0.1 + score[category.to_s] += Math.log(s / total) end - score + # now add prior probability for the category + s = @backend.category_has_trainings?(category) ? @backend.category_training_count(category) : 0.1 + score[category.to_s] += Math.log(s / @backend.total_trainings.to_f) end + score end # Returns the classification of the provided +text+, which is one of the diff --git a/test/bayes/bayesian_common_tests.rb b/test/bayes/bayesian_common_tests.rb index aea9468..38f26f6 100644 --- a/test/bayes/bayesian_common_tests.rb +++ b/test/bayes/bayesian_common_tests.rb @@ -71,7 +71,7 @@ def test_classification end def test_classification_with_threshold - b = threshold_classifier('Digit') + b = threshold_classifier('Number') assert_equal 1, b.categories.size refute b.threshold_enabled? @@ -79,15 +79,15 @@ def test_classification_with_threshold assert b.threshold_enabled? assert_equal 0.0, b.threshold # default - b.threshold = -7.0 + b.threshold = -4.0 - 10.times do |a_number| - b.train_digit(a_number.to_s) - b.train_digit(a_number.to_s) + ['one', 'two', 'three', 'four', 'five'].each do |a_number| + b.train_number(a_number) + b.train_number(a_number) end - 10.times do |a_number| - assert_equal 'Digit', b.classify(a_number.to_s) + ['one', 'two', 'three', 'four', 'five'].each do |a_number| + assert_equal 'Number', b.classify(a_number) end refute b.classify('xyzzy') From 3c4ff8f3b7af3eef2deebbbf1ea6293c74078c0f Mon Sep 17 00:00:00 2001 From: Chase Date: Tue, 17 Jan 2017 15:03:25 -0500 Subject: [PATCH 4/4] apply test patch --- test/bayes/bayesian_integration_test.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/bayes/bayesian_integration_test.rb b/test/bayes/bayesian_integration_test.rb index 8996ab6..a7664b8 100644 --- a/test/bayes/bayesian_integration_test.rb +++ b/test/bayes/bayesian_integration_test.rb @@ -57,7 +57,7 @@ def classification_scores(classifier) @testing_set.collect do |line| parts = line.strip.split("\t") result, score = classifier.classify_with_score(parts.last) - "#{result}:#{score}" + score.infinite? ? "irrelevant" : "#{result}:#{score}" end end end