Skip to content

Commit

Permalink
Merge pull request #41 from MighTguY/feature/accuracy
Browse files Browse the repository at this point in the history
 # This is a combination of 2 commits.
  • Loading branch information
MighTguY authored Feb 7, 2020
2 parents b3e63dc + 3ac94ef commit 711fc79
Show file tree
Hide file tree
Showing 7 changed files with 3,968 additions and 2 deletions.
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,20 @@ Word deletes are generated with taking edit distance which is minimum of max edi

## [Solr Usage](symspell-solr/README.md)

## Accuracy Summary

> Indexed Docs: 3695
> Searches: 8060
| Spellcorrection Strategy | Accuracy | Failures | TP | TN | FP | FN |
|-------------------------- |:--------: |---------: |------ |----- |----- |------ |
| LUCENE | 78.96% | 21.04% | 5883 | 481 | 146 | 1550 |
| Vanilla SymSpell | 88.80% | 11.20% | 6888 | 269 | 358 | 545 |
| Weighted SymSpell | 75.74% | 24.26% | 5781 | 324 | 303 | 1652 |
| Qwerty Vanilla SymSpell | 88.57% | 11.43% | 6860 | 279 | 348 | 573 |
| Qwerty Weighted SymSpell | 75.36% | 24.64% | 5744 | 330 | 297 | 1689 |

## Benchmark Summary
We performed 3 runs each on the 30k and 80k data sets, including results for each verbosity level.
After these runs, the final benchmark results are as follows:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ public static Set<String> edits(String word, double editDistance, Set<String> de
edits(delete, editDistance, deletedWords, maxEd);
}
}
deletedWords.add(word);
return deletedWords;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
package io.github.mightguy.spellcheck.symspell;

import io.github.mightguy.spellcheck.symspell.api.CharDistance;
import io.github.mightguy.spellcheck.symspell.api.DataHolder;
import io.github.mightguy.spellcheck.symspell.api.SpellChecker;
import io.github.mightguy.spellcheck.symspell.api.StringDistance;
import io.github.mightguy.spellcheck.symspell.common.DictionaryItem;
import io.github.mightguy.spellcheck.symspell.common.Murmur3HashFunction;
import io.github.mightguy.spellcheck.symspell.common.QwertyDistance;
import io.github.mightguy.spellcheck.symspell.common.SpellCheckSettings;
import io.github.mightguy.spellcheck.symspell.common.SuggestionItem;
import io.github.mightguy.spellcheck.symspell.common.WeightedDamerauLevenshteinDistance;
import io.github.mightguy.spellcheck.symspell.exception.SpellCheckException;
import io.github.mightguy.spellcheck.symspell.impl.InMemoryDataHolder;
import io.github.mightguy.spellcheck.symspell.impl.SymSpellCheck;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.lang3.time.StopWatch;
import org.junit.Test;

/**
 * Accuracy harness for {@link SpellChecker} implementations.
 *
 * <p>Reads a colon-delimited data set from the classpath, adds every target word to the
 * checker's {@link DataHolder}, then looks up each listed variant and prints how often the top
 * suggestion was (or, for negative samples, was not) the expected word.
 */
public class AccuracyTest {

  /** Classpath resource holding the test records (fields separated by {@code ':'}). */
  private static final String FULL_TEST_DATA = "full_test.txt";

  /** When {@code true}, every failed lookup is printed. Very verbose! */
  private static final boolean PRINT_FAILURES = false;

  /** When {@code true}, the second suggestion is also accepted as a hit. */
  private static final boolean ACCEPT_SECOND_HIT_AS_SUCCESS = false;

  /** Footer line printed after every scenario. */
  private static final String SCENARIO_FOOTER =
      "==================================================";

  /**
   * Indexes the test data into {@code spellChecker}, runs all searches and prints the statistics
   * (timings, accuracy, true/false positives and negatives) to standard output.
   *
   * @param spellChecker checker under test; its data holder receives every target word
   * @throws IOException if the test data resource is missing or cannot be read
   * @throws SpellCheckException if the checker fails while indexing or looking up a term
   */
  public void run(SpellChecker spellChecker) throws IOException, SpellCheckException {
    URL queryResourceUrl = this.getClass().getClassLoader().getResource(FULL_TEST_DATA);
    if (queryResourceUrl == null) {
      // getResource signals a missing resource with null; fail with a readable message
      // instead of an NPE inside the CSV parser.
      throw new IOException("Test data resource not found on classpath: " + FULL_TEST_DATA);
    }

    Map<String, String> tpCandidates = new HashMap<>();
    Map<String, String> fpCandidates = new HashMap<>();

    // index
    StopWatch stopWatch = new StopWatch();
    int indexCount;
    try (CSVParser parser = CSVParser.parse(queryResourceUrl, Charset.forName("UTF-8"),
        CSVFormat.DEFAULT.withDelimiter(':'))) {
      stopWatch.start();
      indexCount = indexRecords(spellChecker, parser, tpCandidates, fpCandidates);
      stopWatch.stop();
    }
    long indexTime = stopWatch.getTime();

    stopWatch.reset();
    stopWatch.start();

    // for each spellTestSetEntry do all searches
    int truePositives = 0;
    int trueNegatives = 0;
    int falsePositives = 0;
    int falseNegatives = 0;
    int count = 0;

    for (Entry<String, String> candidate : tpCandidates.entrySet()) {
      List<SuggestionItem> results = spellChecker.lookupCompound(candidate.getKey());
      Collections.sort(results);
      if (isMatch(candidate, results)) {
        truePositives++;
      } else {
        if (PRINT_FAILURES) {
          System.out.println(
              count + ": '" + candidate.getValue() + "' not found by search for " + candidate
                  .getKey());
          if (results.size() > 0) {
            System.out.println("found '" + results.get(0)
                + (results.size() > 1 ? "' and '" + results.get(1) : "")
                + "' instead");
          }
          System.out.println();
        }
        falseNegatives++;
      }
      count++;
    }

    for (Entry<String, String> candidate : fpCandidates.entrySet()) {
      List<SuggestionItem> results = spellChecker.lookupCompound(candidate.getKey());
      Collections.sort(results);
      // A negative sample only fails when the checker changed the query. isMatch() can only be
      // true for a non-empty result list, so results.get(0) is safe here. The comparison must
      // be made on the suggestion's term: comparing the String key with the SuggestionItem
      // itself is never equal.
      if (isMatch(candidate, results)
          && !candidate.getKey().equals(results.get(0).getTerm().trim())) {
        falsePositives++;
        if (PRINT_FAILURES) {
          System.out
              .println("false-positive: found '" + results.get(0) + "' by search for '" + candidate
                  .getKey() + "'");
          if (results.size() > 1 && ACCEPT_SECOND_HIT_AS_SUCCESS) {
            System.out.println(" + found '" + results.get(1) + "' as well'");
          }
          System.out.println();
        }
      } else {
        trueNegatives++;
      }
      count++;
    }

    stopWatch.stop();
    long searchTime = stopWatch.getTime();
    int success = truePositives + trueNegatives;
    int fail = falsePositives + falseNegatives;

    System.out.println("indexed " + indexCount + " words in " + indexTime + "ms");
    System.out.println(count + " searches");
    System.out.println(searchTime + "ms => "
        + String.format("%1$.3f searches/ms", ((double) count / searchTime)));
    System.out.println();
    System.out.println(
        success + " success / accuracy => " + String.format("%.2f%%", (100.0 * success / count)));
    System.out.println(truePositives + " true-positives");
    System.out.println(trueNegatives + " true-negatives (?)");
    System.out.println();
    System.out.println(fail + " fail => " + String.format("%.2f%%", (100.0 * fail / count)));
    System.out.println(falseNegatives + " false-negatives");
    System.out.println(falsePositives + " false-positives");
    System.out.println();
  }

  /**
   * Adds the target word of every record to the checker's data holder and sorts the record's
   * variants into the positive or negative candidate map.
   *
   * <p>Record layout: field 0 = correct word, field 1 = {@code true} if the variants should be
   * corrected to that word ({@code false} for samples that must not match), field 2 =
   * comma-separated list of similar words.
   *
   * @return number of records indexed
   */
  private int indexRecords(SpellChecker spellChecker, CSVParser parser,
      Map<String, String> tpCandidates, Map<String, String> fpCandidates)
      throws SpellCheckException {
    int indexed = 0;
    Iterator<CSVRecord> csvIterator = parser.iterator();
    while (csvIterator.hasNext()) {
      CSVRecord csvRecord = csvIterator.next();
      String targetWord = csvRecord.get(0);
      if (Boolean.valueOf(csvRecord.get(1))) {
        appendToList(tpCandidates, csvRecord);
      } else {
        // NOTE(review): this compares the boolean column (1) with the word column (0), which
        // looks like a sanity check aimed at a different column - confirm the intent.
        if (csvRecord.get(1).equals(targetWord)) {
          System.out.println("WRONG: " + csvRecord.get(1) + "," + targetWord + ",false");
        }
        appendToList(fpCandidates, csvRecord);
      }

      spellChecker.getDataHolder().addItem(new DictionaryItem(targetWord, 1d, 0d));
      indexed++;
    }
    return indexed;
  }

  /**
   * Maps every variant listed in field 2 of {@code csvRecord} to the record's target word
   * (field 0). A variant that is already present is overwritten.
   */
  private void appendToList(Map<String, String> candidates, CSVRecord csvRecord) {
    String targetWord = csvRecord.get(0);
    for (String variant : csvRecord.get(2).split(",")) {
      candidates.put(variant, targetWord);
    }
  }

  /**
   * Tells whether the suggestions hit the candidate: the (trimmed) top term equals either the
   * expected word or the query itself, or - if second hits are accepted - the second term equals
   * the expected word.
   *
   * @param candidate entry of query (key) to expected word (value)
   * @param results suggestions for the query, best first
   */
  private static boolean isMatch(Entry<String, String> candidate, List<SuggestionItem> results) {
    if (results.isEmpty()) {
      return false;
    }
    String topTerm = results.get(0).getTerm().trim();
    if (topTerm.equals(candidate.getValue()) || topTerm.equals(candidate.getKey())) {
      return true;
    }
    return ACCEPT_SECOND_HIT_AS_SUCCESS
        && results.size() > 1
        && results.get(1).getTerm().trim().equals(candidate.getValue());
  }

  /**
   * Runs the accuracy report for four configurations: default weights and custom weights, each
   * without and with {@link QwertyDistance} as character distance.
   */
  @Test
  public void testAccuracy() throws IOException, SpellCheckException {
    //Basic
    runScenario("========= Basic =============================",
        defaultWeightSettings(), null);

    //Weighted
    runScenario("========= Weighted =============================",
        customWeightSettings(), null);

    //Qwerty
    runScenario("========= Qwerty =============================",
        defaultWeightSettings(), new QwertyDistance());

    //QwertyWeighted
    runScenario("========= QwertyWeighted =============================",
        customWeightSettings(), new QwertyDistance());
  }

  /** Prints the header, runs the report on a fresh in-memory checker, prints the footer. */
  private void runScenario(String header, SpellCheckSettings spellCheckSettings,
      CharDistance charDistance) throws IOException, SpellCheckException {
    System.out.println(header);
    DataHolder dataHolder = new InMemoryDataHolder(spellCheckSettings,
        new Murmur3HashFunction());
    SpellChecker spellChecker = new SymSpellCheck(dataHolder,
        getStringDistance(spellCheckSettings, charDistance),
        spellCheckSettings);
    run(spellChecker);
    System.out.println(SCENARIO_FOOTER);
  }

  /** Settings that leave the edit-operation weights at the builder's defaults. */
  private static SpellCheckSettings defaultWeightSettings() {
    return SpellCheckSettings.builder()
        .countThreshold(0)
        .prefixLength(40)
        .maxEditDistance(2.0d).build();
  }

  /** Settings with explicit weights per edit operation. */
  private static SpellCheckSettings customWeightSettings() {
    return SpellCheckSettings.builder()
        .deletionWeight(1.01f)
        .insertionWeight(0.9f)
        .replaceWeight(0.7f)
        .transpositionWeight(1.0f)
        .countThreshold(0)
        .prefixLength(40)
        .maxEditDistance(2.0d).build();
  }

  /**
   * Builds a {@link WeightedDamerauLevenshteinDistance} from the weights in the settings.
   *
   * @param spellCheckSettings source of the deletion/insertion/replace/transposition weights
   * @param charDistance character distance handed to the string distance; the scenarios
   *     without a keyboard layout pass {@code null}
   */
  private StringDistance getStringDistance(SpellCheckSettings spellCheckSettings,
      CharDistance charDistance) {
    return new WeightedDamerauLevenshteinDistance(spellCheckSettings.getDeletionWeight(),
        spellCheckSettings.getInsertionWeight(),
        spellCheckSettings.getReplaceWeight(),
        spellCheckSettings.getTranspositionWeight(), charDistance);
  }

}
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ public void testSpellChecker() {
public void testSpellDeletes() {
Set<String> del = SpellHelper.getEditDeletes("a", 2.0, 0, 1);
Assert.assertNotNull(del);
Assert.assertEquals(1, del.size());
Assert.assertEquals(2, del.size());

Set<String> del1 = SpellHelper.edits("", 2.0, del, 2.0);
Assert.assertNotNull(del);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,4 +236,5 @@ public void testWordBreak() throws Exception {
}



}
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ public void testSingleWordCorrection() throws SpellCheckException {
SymSpellTest.assertTypoAndCorrected(symSpellCheck,
"slatew", "slate", 2);
SymSpellTest.assertTypoAndCorrected(symSpellCheck,
"ith", "with", 2);
"ith", "it", 2);
SymSpellTest.assertTypoAndCorrected(symSpellCheck,
"plety", "plenty", 2);
SymSpellTest.assertTypoAndCorrected(symSpellCheck,
Expand Down
Loading

0 comments on commit 711fc79

Please sign in to comment.