From af9c421f9a61171b5f112851ddc1091c0168e37a Mon Sep 17 00:00:00 2001 From: Lucky Sharma Date: Sat, 1 Feb 2020 02:56:25 +0530 Subject: [PATCH] Benchmarking using JMH --- pom.xml | 24 +- symspell-benchmark/README.md | 178 -------------- symspell-benchmark/pom.xml | 41 ---- .../CustomizedSymspellSearchBenchmark.java | 84 ------- .../symspell/benchmark/SymSpellBenchMark.java | 223 ------------------ .../benchmark/util/BenchmarkHelper.java | 88 ------- .../github/symspell/benchmark/util/Query.java | 13 - .../SpellCheckerConsole.java | 6 + symspell-lib/pom.xml | 13 + .../symspell/common/QwertyDistance.java | 3 + .../symspell/common/SpellHelper.java | 6 + .../symspell/common/SuggestionItem.java | 6 + .../WeightedDamerauLevenshteinDistance.java | 8 + .../exception/SpellCheckException.java | 28 +++ .../symspell/benchmark/MemoryProfiler.java | 45 ++++ .../benchmark/SymSpellIndexBenchMark.java | 141 +++++++++++ .../benchmark/SymSpellSearchBenchMark.java | 168 +++++++++++++ .../frequency_dictionary_en_30_000.txt | 0 .../frequency_dictionary_en_500_000.txt | 0 .../test}/resources/noisy_query_en_1000.txt | 0 .../eventlistner/CustomSpellCheckListner.java | 17 ++ .../solr/utils/SearchRequestUtil.java | 30 +++ 22 files changed, 493 insertions(+), 629 deletions(-) delete mode 100644 symspell-benchmark/README.md delete mode 100644 symspell-benchmark/pom.xml delete mode 100644 symspell-benchmark/src/main/java/io/github/symspell/benchmark/CustomizedSymspellSearchBenchmark.java delete mode 100644 symspell-benchmark/src/main/java/io/github/symspell/benchmark/SymSpellBenchMark.java delete mode 100644 symspell-benchmark/src/main/java/io/github/symspell/benchmark/util/BenchmarkHelper.java delete mode 100644 symspell-benchmark/src/main/java/io/github/symspell/benchmark/util/Query.java create mode 100644 symspell-lib/src/test/java/io/github/mightguy/spellcheck/symspell/benchmark/MemoryProfiler.java create mode 100644 
symspell-lib/src/test/java/io/github/mightguy/spellcheck/symspell/benchmark/SymSpellIndexBenchMark.java create mode 100644 symspell-lib/src/test/java/io/github/mightguy/spellcheck/symspell/benchmark/SymSpellSearchBenchMark.java rename {symspell-benchmark/src/main => symspell-lib/src/test}/resources/frequency_dictionary_en_30_000.txt (100%) rename {symspell-benchmark/src/main => symspell-lib/src/test}/resources/frequency_dictionary_en_500_000.txt (100%) rename {symspell-benchmark/src/main => symspell-lib/src/test}/resources/noisy_query_en_1000.txt (100%) diff --git a/pom.xml b/pom.xml index 424b030..f853f76 100644 --- a/pom.xml +++ b/pom.xml @@ -20,7 +20,7 @@ 1.8 3.1 2.5.2 - 3.0.0 + 3.1.0 8.29 google_checks.xml checkstyle-suppressions.xml @@ -107,6 +107,27 @@ ${solr.version} test + + org.openjdk.jmh + jmh-core + ${jmh.version} + + + org.openjdk.jmh + jmh-generator-annprocess + ${jmh.version} + provided + + + org.apache.commons + commons-csv + 1.3 + + + io.github.mightguy + symspell-lib + ${parent.version} + @@ -422,7 +443,6 @@ symspell-lib symspell-console symspell-solr - symspell-benchmark diff --git a/symspell-benchmark/README.md b/symspell-benchmark/README.md deleted file mode 100644 index 53e9120..0000000 --- a/symspell-benchmark/README.md +++ /dev/null @@ -1,178 +0,0 @@ -# Benchmarking results -We have done benchmarking of the Vannila Symspell on a Mac(System details are mentioned below). -For benchmarking we have done benchmarking with the dataset of `30k` and `80k` words with edit distance of `[1, 2, 3]`. -Prefix length was constant to `7`. -Benchmarking also contain results for the Verbosity levels of **TOP**, **CLOSEST** and **ALL** - - -## Benchmark Summary -We have done 3 runs each for 30k and 80k data set, which also includes results for each verbosity level. 
-After the runs the final benchmarking looks like: -``` -Average Precalculation time instance 30843.33 ms -Average Lookup time instance 138141.09296296295 ns ~ 0.03814 ms -Total Lookup results instance 648092 -``` -Benchmark Summary is as follows: - -### Run 1 -``` -DataSize : 30,000 -Queries : 1000 -PrefixLength : 7 -MaxEditDistance : 1 - -Precalculation Time take : 4.017 s -``` - -Verbosity : __TOP__ -``` -Lookup : 622 -Response Time : 12684 ns/op -``` -Verbosity : __CLOSEST__ -``` -Lookup : 1610 -Response Time : 8646 ns/op -``` -Verbosity : __ALL__ -``` -Lookup : 4693 -Response Time : 14304 ns/op -``` - -### Run 2 - -``` -DataSize : 82,761 -Queries : 1000 -PrefixLength : 7 -MaxEditDistance : 1 - -Precalculation Time take : 4.653 s -``` - -Verbosity : __TOP__ -``` -Lookup : 635 -Response Time : 5740 ns/op -``` -Verbosity : __CLOSEST__ -``` -Lookup : 2347 -Response Time : 5686 ns/op -``` -Verbosity : __ALL__ -``` -Lookup : 6546 -Response Time : 13159 ns/op -``` - -### Run 3 -``` -DataSize : 30,000 -Queries : 1000 -PrefixLength : 7 -MaxEditDistance : 2 - -Precalculation Time take : 4.121 s -``` - -Verbosity : __TOP__ -``` -Lookup : 850 -Response Time : 17238 ns/op -``` -Verbosity : __CLOSEST__ -``` -Lookup : 2876 -Response Time : 21163 ns/op -``` -Verbosity : __ALL__ -``` -Lookup : 37058 -Response Time : 88658 ns/op -``` - -### Run 4 - -``` -DataSize : 82,761 -Queries : 1000 -PrefixLength : 7 -MaxEditDistance : 2 - -Precalculation Time take : 62.863 s -``` - -Verbosity : __TOP__ -``` -Lookup : 858 -Response Time : 34335 ns/op -``` -Verbosity : __CLOSEST__ -``` -Lookup : 4156 -Response Time : 146604 ns/op -``` -Verbosity : __ALL__ -``` -Lookup : 48262 -Response Time : 262154 ns/op -``` - -### Run 5 - -``` -DataSize : 30,000 -Queries : 1000 -PrefixLength : 7 -MaxEditDistance : 3 - -Precalculation Time take : 8.416 s -``` - -Verbosity : __TOP__ -``` -Lookup : 914 -Response Time : 144562 ns/op -``` -Verbosity : __CLOSEST__ -``` -Lookup : 3165 -Response Time : 37652 
ns/op -``` -Verbosity : __ALL__ -``` -Lookup : 193833 -Response Time : 376399 ns/op -``` - -### Run 6 - -``` -DataSize : 82,761 -Queries : 1000 -PrefixLength : 7 -MaxEditDistance : 1 - -Precalculation Time take : 59,105 s -``` - -Verbosity : __TOP__ -``` -Lookup : 920 -Response Time : 40069 ns/op -``` -Verbosity : __CLOSEST__ -``` -Lookup : 4443 -Response Time : 40896 ns/op -``` -Verbosity : __ALL__ -``` -Lookup : 333931 -Response Time : 1427456 ns/op -``` - - diff --git a/symspell-benchmark/pom.xml b/symspell-benchmark/pom.xml deleted file mode 100644 index c11cb39..0000000 --- a/symspell-benchmark/pom.xml +++ /dev/null @@ -1,41 +0,0 @@ - - - - symspell - io.github.mightguy - 6.6-SNAPSHOT - - 4.0.0 - - symspell-benchmark - - - - org.projectlombok - lombok - - - org.openjdk.jmh - jmh-core - ${jmh.version} - - - org.openjdk.jmh - jmh-generator-annprocess - ${jmh.version} - provided - - - org.apache.commons - commons-csv - 1.3 - - - io.github.mightguy - symspell-lib - ${parent.version} - - - \ No newline at end of file diff --git a/symspell-benchmark/src/main/java/io/github/symspell/benchmark/CustomizedSymspellSearchBenchmark.java b/symspell-benchmark/src/main/java/io/github/symspell/benchmark/CustomizedSymspellSearchBenchmark.java deleted file mode 100644 index 6cf6c74..0000000 --- a/symspell-benchmark/src/main/java/io/github/symspell/benchmark/CustomizedSymspellSearchBenchmark.java +++ /dev/null @@ -1,84 +0,0 @@ -package io.github.symspell.benchmark; - -import io.github.mightguy.spellcheck.symspell.api.SpellChecker; -import io.github.mightguy.spellcheck.symspell.common.Verbosity; -import io.github.mightguy.spellcheck.symspell.exception.SpellCheckException; -import io.github.symspell.benchmark.util.BenchmarkHelper; -import io.github.symspell.benchmark.util.Query; -import java.io.IOException; -import java.util.List; -import java.util.concurrent.TimeUnit; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.BenchmarkMode; -import 
org.openjdk.jmh.annotations.Mode; -import org.openjdk.jmh.annotations.OutputTimeUnit; -import org.openjdk.jmh.annotations.Scope; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.State; -import org.openjdk.jmh.infra.Blackhole; -import org.openjdk.jmh.profile.GCProfiler; -import org.openjdk.jmh.runner.Runner; -import org.openjdk.jmh.runner.RunnerException; -import org.openjdk.jmh.runner.options.Options; -import org.openjdk.jmh.runner.options.OptionsBuilder; - - -public class CustomizedSymspellSearchBenchmark { - - @State(Scope.Benchmark) - public static class Data { - - public SpellChecker spellChecker; - public List queries; - public BenchmarkHelper benchmarkHelper; - - - @Setup - public void setUp() throws IOException, SpellCheckException { - benchmarkHelper = new BenchmarkHelper("frequency_dictionary_en_500_000.txt", - "noisy_query_en_1000.txt"); - spellChecker = benchmarkHelper.getDefaultSymSpellChecker(); - queries = benchmarkHelper.getQueries(); - benchmarkHelper.index(spellChecker); - } - } - - @State(Scope.Thread) - public static class Iterator { - - private int n = 0; - - Query getNextQuery(List queries) { - if (n >= queries.size()) { - n = 0; - } - return queries.get(n++); - } - } - - - @Benchmark - @BenchmarkMode(Mode.AverageTime) - @OutputTimeUnit(TimeUnit.MILLISECONDS) - public void testSearchBenchMark(Data data, Iterator i, Blackhole blackhole) - throws SpellCheckException { - Query query = null; - while (query == null) { - query = i.getNextQuery(data.queries); - } - boolean res = data.benchmarkHelper.testResult(query, data.spellChecker, Verbosity.TOP); - blackhole.consume(res); - } - - - - public static void main(String[] args) throws RunnerException { - Options opt = new OptionsBuilder() - .include(CustomizedSymspellSearchBenchmark.class.getSimpleName()) - .warmupIterations(0) - .measurementIterations(1) - .forks(1) - .build(); - new Runner(opt).run(); - } -} diff --git 
a/symspell-benchmark/src/main/java/io/github/symspell/benchmark/SymSpellBenchMark.java b/symspell-benchmark/src/main/java/io/github/symspell/benchmark/SymSpellBenchMark.java deleted file mode 100644 index bcc90bc..0000000 --- a/symspell-benchmark/src/main/java/io/github/symspell/benchmark/SymSpellBenchMark.java +++ /dev/null @@ -1,223 +0,0 @@ -package io.github.symspell.benchmark; - -import io.github.mightguy.spellcheck.symspell.api.DataHolder; -import io.github.mightguy.spellcheck.symspell.api.SpellChecker; -import io.github.mightguy.spellcheck.symspell.api.StringDistance; -import io.github.mightguy.spellcheck.symspell.common.DictionaryItem; -import io.github.mightguy.spellcheck.symspell.common.Murmur3HashFunction; -import io.github.mightguy.spellcheck.symspell.common.SpellCheckSettings; -import io.github.mightguy.spellcheck.symspell.common.SuggestionItem; -import io.github.mightguy.spellcheck.symspell.common.Verbosity; -import io.github.mightguy.spellcheck.symspell.common.WeightedDamerauLevenshteinDistance; -import io.github.mightguy.spellcheck.symspell.exception.SpellCheckException; -import io.github.mightguy.spellcheck.symspell.impl.InMemoryDataHolder; -import io.github.mightguy.spellcheck.symspell.impl.SymSpellCheck; -import java.io.IOException; -import java.net.URL; -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import java.util.concurrent.TimeUnit; -import org.apache.commons.csv.CSVFormat; -import org.apache.commons.csv.CSVParser; -import org.apache.commons.csv.CSVRecord; -import org.apache.commons.lang3.time.StopWatch; -import org.junit.Assert; - -public class SymSpellBenchMark { - - String[] dataFiles = { -// "frequency_dictionary_en_30_000.txt", -// "frequency_dictionary_en_82_765.txt", - "frequency_dictionary_en_500_000.txt" - }; - - String[] queryFiles = { - "noisy_query_en_1000.txt" - }; - - SpellCheckSettings basicSpellCheckerSettings = 
getDefaultSymSpellCheckerSettings(); - SpellCheckSettings custmoizedSpellCheckerSettings = getDefaultSymSpellCheckerSettings(); - StringDistance basicSpellCheckerDistance = getDefaultSymSpellCheckerDistance(); - - private StringDistance getDefaultSymSpellCheckerDistance() { - return new WeightedDamerauLevenshteinDistance(basicSpellCheckerSettings.getDeletionWeight(), - basicSpellCheckerSettings.getInsertionWeight(), - basicSpellCheckerSettings.getReplaceWeight(), - basicSpellCheckerSettings.getTranspositionWeight(), null); - } - - public SpellCheckSettings getDefaultSymSpellCheckerSettings() { - SpellCheckSettings spellCheckSettings = SpellCheckSettings.builder().maxEditDistance(5).build(); - - return spellCheckSettings; - - } - - - void warmUp() throws IOException, SpellCheckException { - DataHolder dataHolder = new InMemoryDataHolder(basicSpellCheckerSettings, - new Murmur3HashFunction()); - indexData(dataFiles[0], dataHolder); - SpellChecker basicSpellChecker = new SymSpellCheck(dataHolder, - basicSpellCheckerDistance, - basicSpellCheckerSettings); - - List suggestionItemList = basicSpellChecker - .lookup("hockie", Verbosity.ALL, 1); - Collections.sort(suggestionItemList); - List compundSuggestions = basicSpellChecker - .lookupCompound("hockie", 1); - Assert.assertNotNull(suggestionItemList); - Assert.assertNotNull(compundSuggestions); - } - - private void indexData(String dataResourceName, DataHolder dataHolder) - throws IOException, SpellCheckException { - URL resourceUrl = this.getClass().getClassLoader().getResource(dataResourceName); - CSVParser parser = CSVParser - .parse(resourceUrl, Charset.forName("UTF-8"), CSVFormat.DEFAULT.withDelimiter(' ')); - Iterator csvIterator = parser.iterator(); - while (csvIterator.hasNext()) { - CSVRecord csvRecord = csvIterator.next(); - dataHolder - .addItem(new DictionaryItem(csvRecord.get(0), Double.valueOf(csvRecord.get(1)), 0d)); - } - } - - public static void main(String[] args) throws IOException, SpellCheckException 
{ - SymSpellBenchMark symSpellBenchMark = new SymSpellBenchMark(); -// symSpellBenchMark.warmUp(); - symSpellBenchMark.benchmarkPrecalculationLookup(); - } - - public void benchmarkPrecalculationLookup() throws IOException, SpellCheckException { - int resultNumber = 0; - int repetitions = 1000; - int totalLoopCount = 0; - long totalMatches = 0; - long totalOrigMatches = 0; - double totalLoadTime, totalMem, totalLookupTime, totalOrigLoadTime, totalOrigMem, totalOrigLookupTime; - totalLoadTime = totalMem = totalLookupTime = totalOrigLoadTime = totalOrigMem = totalOrigLookupTime = 0; - long totalRepetitions = 0; - List query1K = buildQuery1K(); - StopWatch stopWatch = StopWatch.createStarted(); - for (int maxEditDistance = 1; maxEditDistance <= 3; maxEditDistance++) { - //benchmark dictionary precalculation size and time - //maxEditDistance=1/2/3; prefixLength=5/6/7; dictionary=30k/82k/500k; class=instantiated/static - for (int i = 0; i < dataFiles.length; i++) { - totalLoopCount++; - long memSize = Runtime.getRuntime().totalMemory(); - stopWatch.reset(); - stopWatch.start(); - //SymspellInstance: - SpellCheckSettings spellCheckSettings = SpellCheckSettings.builder() - .maxEditDistance(maxEditDistance) - .build(); - long prefixLength = spellCheckSettings.getPrefixLength(); - DataHolder dataHolder = new InMemoryDataHolder(basicSpellCheckerSettings, - new Murmur3HashFunction()); - - //Indexing data - indexData(dataFiles[i], dataHolder); - stopWatch.stop(); - long memDelta = Runtime.getRuntime().totalMemory() - memSize; - totalLoadTime += stopWatch.getTime(TimeUnit.MILLISECONDS); - totalMem += memDelta / 1024.0 / 1024.0; - System.out.println( - "Precalculation instance " + stopWatch.getTime(TimeUnit.MILLISECONDS) + "ms " + ( - memDelta / 1024.0 / 1024.0) - + "MB " + dataHolder.getSize() + " words " + " MaxEditDistance=" + maxEditDistance - + " prefixLength=" + prefixLength + " dict=" + dataFiles[i]); - - SpellChecker basicSpellChecker = new SymSpellCheck(dataHolder, - 
basicSpellCheckerDistance, - basicSpellCheckerSettings); - -// for (Verbosity verbosity : Verbosity.values()) { - Verbosity verbosity = Verbosity.TOP; -// //instantiated exact -// stopWatch.reset(); -// stopWatch.start(); -// for (int round = 0; round < repetitions; round++) { -// resultNumber = basicSpellChecker.lookup("different", verbosity, maxEditDistance) -// .size(); -// } -// stopWatch.stop(); -// totalLookupTime += stopWatch.getTime(TimeUnit.NANOSECONDS); -// totalMatches += resultNumber; -// System.out.println("Lookup instance " + resultNumber + " results " + ( -// stopWatch.getTime(TimeUnit.NANOSECONDS) / repetitions) -// + "ns/op verbosity=" + verbosity + " query=exact"); -// totalRepetitions += repetitions; -// -// //instantiated non-exact -// stopWatch.reset(); -// stopWatch.start(); -// for (int round = 0; round < repetitions; round++) { -// resultNumber = basicSpellChecker.lookup("hockie", verbosity, maxEditDistance).size(); -// } -// stopWatch.stop(); -// totalLookupTime += stopWatch.getTime(TimeUnit.NANOSECONDS); -// totalMatches += resultNumber; -// System.out.println("Lookup instance " + resultNumber + " results " + ( -// stopWatch.getTime(TimeUnit.NANOSECONDS) / repetitions) -// + "ns/op verbosity=" + verbosity + " query=non-exact"); -// totalRepetitions += repetitions; -// -// //instantiated mix - stopWatch.reset(); - stopWatch.start(); - resultNumber = 0; - for (String word : query1K) { - resultNumber += basicSpellChecker.lookup(word, verbosity, maxEditDistance).size(); - } - stopWatch.stop(); - totalLookupTime += stopWatch.getTime(TimeUnit.NANOSECONDS); - totalMatches += resultNumber; - System.out.println("Lookup instance " + resultNumber + " results " + ( - stopWatch.getTime(TimeUnit.NANOSECONDS) / query1K.size()) - + "ns/op verbosity=" + verbosity + " query=mix"); - totalRepetitions += repetitions; -// } - System.out.println(); - dataHolder.clear(); - dataHolder = null; - basicSpellChecker = null; - System.out.println("Cleaning GC 
started"); - System.gc(); - System.out.println("Cleaning GC completed"); - } - - } - - System.out.println( - "Average Precalculation time instance " + (totalLoadTime / totalLoopCount) + "ms"); - - System.out.println( - "Average Precalculation memory instance " + (totalMem / totalLoopCount) + "MB "); - - System.out.println( - "Average Lookup time instance " + (totalLookupTime / totalRepetitions) + "ns"); - - System.out.println("Total Lookup results instance " + totalMatches); - - - } - - private List buildQuery1K() throws IOException { - List testList = new ArrayList<>(); - - URL resourceUrl = this.getClass().getClassLoader().getResource(queryFiles[0]); - CSVParser parser = CSVParser - .parse(resourceUrl, Charset.forName("UTF-8"), CSVFormat.DEFAULT.withDelimiter(' ')); - Iterator csvIterator = parser.iterator(); - while (csvIterator.hasNext()) { - CSVRecord csvRecord = csvIterator.next(); - testList.add(csvRecord.get(0)); - } - return testList; - } -} diff --git a/symspell-benchmark/src/main/java/io/github/symspell/benchmark/util/BenchmarkHelper.java b/symspell-benchmark/src/main/java/io/github/symspell/benchmark/util/BenchmarkHelper.java deleted file mode 100644 index 1bf1a99..0000000 --- a/symspell-benchmark/src/main/java/io/github/symspell/benchmark/util/BenchmarkHelper.java +++ /dev/null @@ -1,88 +0,0 @@ -package io.github.symspell.benchmark.util; - -import io.github.mightguy.spellcheck.symspell.api.DataHolder; -import io.github.mightguy.spellcheck.symspell.api.SpellChecker; -import io.github.mightguy.spellcheck.symspell.common.DictionaryItem; -import io.github.mightguy.spellcheck.symspell.common.Murmur3HashFunction; -import io.github.mightguy.spellcheck.symspell.common.SpellCheckSettings; -import io.github.mightguy.spellcheck.symspell.common.SuggestionItem; -import io.github.mightguy.spellcheck.symspell.common.Verbosity; -import io.github.mightguy.spellcheck.symspell.common.WeightedDamerauLevenshteinDistance; -import 
io.github.mightguy.spellcheck.symspell.exception.SpellCheckException; -import io.github.mightguy.spellcheck.symspell.impl.InMemoryDataHolder; -import io.github.mightguy.spellcheck.symspell.impl.SymSpellCheck; -import java.io.IOException; -import java.net.URL; -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import lombok.Getter; -import org.apache.commons.csv.CSVFormat; -import org.apache.commons.csv.CSVParser; -import org.apache.commons.csv.CSVRecord; - -@Getter -public class BenchmarkHelper { - - private final Map words = new HashMap<>(); - private final List queries = new ArrayList<>(); - - public BenchmarkHelper(String dataResourceName, String queryResourceName) throws IOException { - URL resourceUrl = this.getClass().getClassLoader().getResource(dataResourceName); - CSVParser parser = CSVParser - .parse(resourceUrl, Charset.forName("UTF-8"), CSVFormat.DEFAULT.withDelimiter(' ')); - Iterator csvIterator = parser.iterator(); - while (csvIterator.hasNext()) { - CSVRecord csvRecord = csvIterator.next(); - words.put(csvRecord.get(0), Double.valueOf(csvRecord.get(1))); - } - - URL queryResourceUrl = this.getClass().getClassLoader().getResource(queryResourceName); - CSVParser qparser = CSVParser - .parse(queryResourceUrl, Charset.forName("UTF-8"), CSVFormat.DEFAULT.withDelimiter(' ')); - csvIterator = qparser.iterator(); - while (csvIterator.hasNext()) { - CSVRecord csvRecord = csvIterator.next(); - queries.add(new Query(csvRecord.get(0), csvRecord.get(1), Double.valueOf(csvRecord.get(2)))); - } - System.out.println(queries.size() + " Query Size"); - System.out.println(words.size() + " Data Size"); - } - - public boolean index(SpellChecker spellChecker) throws SpellCheckException { - int count = 0; - for (Map.Entry entry : words.entrySet()) { - spellChecker.getDataHolder().addItem(new DictionaryItem(entry.getKey(), entry.getValue(), - 
spellChecker.getSpellCheckSettings().getMaxEditDistance())); - count++; - } - System.out.println("Ingestion Completed " + count); - return true; - } - - public boolean testResult(Query searchQuery, SpellChecker spellChecker, Verbosity verbosity) - throws SpellCheckException { - List suggestionItems = spellChecker - .lookup(searchQuery.getTestString(), verbosity, - spellChecker.getSpellCheckSettings().getMaxEditDistance()); - return suggestionItems.parallelStream() - .anyMatch(s -> s.getTerm().equals(searchQuery.getExpectedString())); - } - - public SpellChecker getDefaultSymSpellChecker() { - SpellCheckSettings spellCheckSettings = SpellCheckSettings.builder().maxEditDistance(5).build(); - - WeightedDamerauLevenshteinDistance weightedDamerauLevenshteinDistance = - new WeightedDamerauLevenshteinDistance(spellCheckSettings.getDeletionWeight(), - spellCheckSettings.getInsertionWeight(), spellCheckSettings.getReplaceWeight(), - spellCheckSettings.getTranspositionWeight(), null); - DataHolder dataHolder = new InMemoryDataHolder(spellCheckSettings, new Murmur3HashFunction()); - - return new SymSpellCheck(dataHolder, weightedDamerauLevenshteinDistance, - spellCheckSettings); - } - -} diff --git a/symspell-benchmark/src/main/java/io/github/symspell/benchmark/util/Query.java b/symspell-benchmark/src/main/java/io/github/symspell/benchmark/util/Query.java deleted file mode 100644 index 0c85e2e..0000000 --- a/symspell-benchmark/src/main/java/io/github/symspell/benchmark/util/Query.java +++ /dev/null @@ -1,13 +0,0 @@ -package io.github.symspell.benchmark.util; - -import lombok.AllArgsConstructor; -import lombok.Getter; - -@AllArgsConstructor -@Getter -public class Query { - - private String testString; - private String expectedString; - private double editDistance; -} diff --git a/symspell-console/src/main/java/io/github/mightguy/spellcheckconsole/SpellCheckerConsole.java b/symspell-console/src/main/java/io/github/mightguy/spellcheckconsole/SpellCheckerConsole.java index 
19f742c..466d349 100644 --- a/symspell-console/src/main/java/io/github/mightguy/spellcheckconsole/SpellCheckerConsole.java +++ b/symspell-console/src/main/java/io/github/mightguy/spellcheckconsole/SpellCheckerConsole.java @@ -106,6 +106,12 @@ private void suggestItemOnConsole() throws IOException, SpellCheckException { } } + /** + * Main Method for Console + * @param args + * @throws IOException + * @throws SpellCheckException + */ public static void main(String[] args) throws IOException, SpellCheckException { SpellCheckerConsole spellCheckerConsole = new SpellCheckerConsole(); spellCheckerConsole.init(); diff --git a/symspell-lib/pom.xml b/symspell-lib/pom.xml index e65ce23..582b305 100644 --- a/symspell-lib/pom.xml +++ b/symspell-lib/pom.xml @@ -37,6 +37,19 @@ org.slf4j slf4j-api + + org.openjdk.jmh + jmh-core + + + org.openjdk.jmh + jmh-generator-annprocess + provided + + + org.apache.commons + commons-csv + diff --git a/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/common/QwertyDistance.java b/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/common/QwertyDistance.java index 099d7cd..a2924f9 100644 --- a/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/common/QwertyDistance.java +++ b/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/common/QwertyDistance.java @@ -33,6 +33,9 @@ public double distance(char a, char b) { return this.operationCost[a][b]; } + /** + * Initializing the cost matrix + */ public void initializeCostMatrix() { for (double[] row : this.operationCost) { Arrays.fill(row, defaultValue); diff --git a/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/common/SpellHelper.java b/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/common/SpellHelper.java index 05c2ae5..5c6997c 100644 --- a/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/common/SpellHelper.java +++ 
b/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/common/SpellHelper.java @@ -85,6 +85,12 @@ public static boolean isEqualDouble(double d1, double d2, final double threshold return Math.abs(d1 - d2) < threshold; } + /** + * Check if heads are same + * @param suggestions + * @param suggestions1 + * @return boolean + */ public static boolean continueConditionIfHeadIsSame(List suggestions, List suggestions1) { return CollectionUtils.isEmpty(suggestions1) diff --git a/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/common/SuggestionItem.java b/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/common/SuggestionItem.java index fc8bfc1..69ad9eb 100644 --- a/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/common/SuggestionItem.java +++ b/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/common/SuggestionItem.java @@ -28,6 +28,12 @@ public class SuggestionItem implements Comparator, Comparable afterIteration(BenchmarkParams bp, IterationParams ip, + IterationResult result) { + MemoryUsage heapUsage = ManagementFactory.getMemoryMXBean().getHeapMemoryUsage(); + MemoryUsage nonheapUsage = ManagementFactory.getMemoryMXBean().getNonHeapMemoryUsage(); + + Collection results = new ArrayList<>(); + results.add( + new ScalarResult(Defaults.PREFIX + "mem.heap", heapUsage.getUsed() / (1024 * 1024.0), "MB", + AggregationPolicy.MAX)); + results.add(new ScalarResult( + Defaults.PREFIX + "mem.nonheap", nonheapUsage.getUsed() / (1024 * 1024.0), "MB", + AggregationPolicy.MAX)); + + return results; + } + +} \ No newline at end of file diff --git a/symspell-lib/src/test/java/io/github/mightguy/spellcheck/symspell/benchmark/SymSpellIndexBenchMark.java b/symspell-lib/src/test/java/io/github/mightguy/spellcheck/symspell/benchmark/SymSpellIndexBenchMark.java new file mode 100644 index 0000000..49291c7 --- /dev/null +++ 
b/symspell-lib/src/test/java/io/github/mightguy/spellcheck/symspell/benchmark/SymSpellIndexBenchMark.java @@ -0,0 +1,141 @@ +package io.github.mightguy.spellcheck.symspell.benchmark; + +import io.github.mightguy.spellcheck.symspell.api.CharDistance; +import io.github.mightguy.spellcheck.symspell.api.DataHolder; +import io.github.mightguy.spellcheck.symspell.api.SpellChecker; +import io.github.mightguy.spellcheck.symspell.api.StringDistance; +import io.github.mightguy.spellcheck.symspell.common.DictionaryItem; +import io.github.mightguy.spellcheck.symspell.common.Murmur3HashFunction; +import io.github.mightguy.spellcheck.symspell.common.SpellCheckSettings; +import io.github.mightguy.spellcheck.symspell.common.WeightedDamerauLevenshteinDistance; +import io.github.mightguy.spellcheck.symspell.exception.SpellCheckException; +import io.github.mightguy.spellcheck.symspell.impl.InMemoryDataHolder; +import io.github.mightguy.spellcheck.symspell.impl.SymSpellCheck; +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.nio.charset.Charset; +import java.nio.file.Paths; +import java.util.concurrent.TimeUnit; +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVRecord; +import org.junit.Test; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.results.format.ResultFormatType; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import 
org.openjdk.jmh.runner.options.OptionsBuilder; + +@State(Scope.Benchmark) +public class SymSpellIndexBenchMark { + + + @Param({"1.0d", "2.0d", "3.0d"}) + public double maxEditDistance; + + @Param({"frequency_dictionary_en_30_000.txt", "frequency_dictionary_en_82_765.txt", + "frequency_dictionary_en_500_000.txt"}) + public String dataFile; + + public String queryFile = "noisy_query_en_1000.txt"; + public SpellChecker spellChecker; + private static long totalMatches = 0; + + @Setup(Level.Iteration) + public void setup() throws SpellCheckException, IOException { + SpellCheckSettings spellCheckSettings = SpellCheckSettings.builder() + .maxEditDistance(maxEditDistance).build(); + + DataHolder dataHolder = new InMemoryDataHolder(spellCheckSettings, + new Murmur3HashFunction()); + + spellChecker = new SymSpellCheck(dataHolder, + getStringDistance(spellCheckSettings, null), + spellCheckSettings); + + } + + + @Benchmark + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.MILLISECONDS) + @Measurement(iterations = 1, time = 10, timeUnit = TimeUnit.SECONDS) + public void searchBenchmark() throws SpellCheckException, IOException, InterruptedException { + indexData(dataFile, spellChecker.getDataHolder()); + System.out.println(" DataHolder Indexed Size " + spellChecker.getDataHolder().getSize()); + Thread.sleep(10000); + } + + @TearDown(Level.Iteration) + public void tearDown() { + spellChecker = null; + } + + private StringDistance getStringDistance(SpellCheckSettings spellCheckSettings, + CharDistance charDistance) { + return new WeightedDamerauLevenshteinDistance(spellCheckSettings.getDeletionWeight(), + spellCheckSettings.getInsertionWeight(), + spellCheckSettings.getReplaceWeight(), + spellCheckSettings.getTranspositionWeight(), charDistance); + } + + private void indexData(String dataResourceName, DataHolder dataHolder) + throws IOException, SpellCheckException { + URL resourceUrl = this.getClass().getClassLoader().getResource(dataResourceName); + CSVParser parser 
= CSVParser
+        .parse(resourceUrl, Charset.forName("UTF-8"), CSVFormat.DEFAULT.withDelimiter(' '));
+    java.util.Iterator csvIterator = parser.iterator();
+    while (csvIterator.hasNext()) {
+      CSVRecord csvRecord = csvIterator.next();
+      dataHolder
+          .addItem(new DictionaryItem(csvRecord.get(0), Double.valueOf(csvRecord.get(1)), 0d));
+    }
+  }
+
+
+  @Test
+  public void testBenchmarkIndex() throws RunnerException, IOException {
+    File file = checkFileAndCreate(SymSpellIndexBenchMark.class.getName());
+    Options opt = new OptionsBuilder()
+        .include(SymSpellIndexBenchMark.class.getSimpleName())
+        .addProfiler(MemoryProfiler.class.getName())
+        .resultFormat(ResultFormatType.JSON)
+        .result(file.getAbsolutePath())
+        .warmupIterations(0)
+        .measurementIterations(1)
+        .forks(1)
+        .build();
+    new Runner(opt).run();
+    System.out.println("Total Lookup results instance " + totalMatches);
+
+  }
+
+  private File checkFileAndCreate(String name) throws IOException {
+    String targetFolderPath = Paths.get(
+        this.getClass().getResource("/").getFile()).getParent().toString() + "/benchmark-result/";
+
+    File targetFolder = new File(targetFolderPath);
+    targetFolder.mkdirs();
+
+    File file = new File(
+        targetFolder, SymSpellIndexBenchMark.class.getSimpleName()
+            + "_" + System.currentTimeMillis() + ".json");
+    if (file.exists()) {
+      file.delete();
+    }
+    file.createNewFile();
+    return file;
+  }
+}
diff --git a/symspell-lib/src/test/java/io/github/mightguy/spellcheck/symspell/benchmark/SymSpellSearchBenchMark.java b/symspell-lib/src/test/java/io/github/mightguy/spellcheck/symspell/benchmark/SymSpellSearchBenchMark.java
new file mode 100644
index 0000000..b0a526c
--- /dev/null
+++ b/symspell-lib/src/test/java/io/github/mightguy/spellcheck/symspell/benchmark/SymSpellSearchBenchMark.java
@@ -0,0 +1,213 @@
+package io.github.mightguy.spellcheck.symspell.benchmark;
+
+import io.github.mightguy.spellcheck.symspell.api.CharDistance;
+import io.github.mightguy.spellcheck.symspell.api.DataHolder;
+import io.github.mightguy.spellcheck.symspell.api.SpellChecker;
+import io.github.mightguy.spellcheck.symspell.api.StringDistance;
+import io.github.mightguy.spellcheck.symspell.common.DictionaryItem;
+import io.github.mightguy.spellcheck.symspell.common.Murmur3HashFunction;
+import io.github.mightguy.spellcheck.symspell.common.SpellCheckSettings;
+import io.github.mightguy.spellcheck.symspell.common.Verbosity;
+import io.github.mightguy.spellcheck.symspell.common.WeightedDamerauLevenshteinDistance;
+import io.github.mightguy.spellcheck.symspell.exception.SpellCheckException;
+import io.github.mightguy.spellcheck.symspell.impl.InMemoryDataHolder;
+import io.github.mightguy.spellcheck.symspell.impl.SymSpellCheck;
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
+import org.junit.Test;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.results.format.ResultFormatType;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.RunnerException;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+/**
+ * JMH benchmark that runs every query of {@code noisy_query_en_1000.txt} through
+ * {@code SpellChecker.lookup} for each combination of verbosity, maximum edit distance and
+ * dictionary file. {@link #testBenchmarkSearch()} launches it as a JUnit test.
+ */
+@State(Scope.Benchmark)
+public class SymSpellSearchBenchMark {
+
+  @Param({"TOP", "CLOSEST", "ALL"})
+  private String verbosity;
+
+  @Param({"1.0d", "2.0d", "3.0d"})
+  public double maxEditDistance;
+
+  @Param({"frequency_dictionary_en_30_000.txt", "frequency_dictionary_en_82_765.txt",
+      "frequency_dictionary_en_500_000.txt"})
+  public String dataFile;
+
+  public String queryFile = "noisy_query_en_1000.txt";
+  public List<String> queries = readQueries(queryFile);
+  public SpellChecker spellChecker;
+  // Running total of suggestions returned by lookup; printed by testBenchmarkSearch().
+  // NOTE(review): the benchmark runs with forks(1), i.e. presumably in a separate JVM, so the
+  // value printed by the launching test may stay 0 - confirm.
+  private static long totalMatches = 0;
+
+  /**
+   * Builds a fresh spell checker and indexes {@link #dataFile} before every iteration.
+   *
+   * @throws SpellCheckException propagated from the spell-check library while indexing
+   * @throws IOException if the dictionary resource is missing or cannot be read
+   */
+  @Setup(Level.Iteration)
+  public void setup() throws SpellCheckException, IOException {
+    SpellCheckSettings spellCheckSettings = SpellCheckSettings.builder()
+        .maxEditDistance(maxEditDistance).build();
+
+    DataHolder dataHolder = new InMemoryDataHolder(spellCheckSettings,
+        new Murmur3HashFunction());
+
+    spellChecker = new SymSpellCheck(dataHolder,
+        getStringDistance(spellCheckSettings, null),
+        spellCheckSettings);
+    indexData(dataFile, dataHolder);
+    System.out.println(" DataHolder Indexed Size " + dataHolder.getSize());
+  }
+
+  /**
+   * Looks up every query and adds the number of returned suggestions to the running total.
+   *
+   * @throws SpellCheckException propagated from {@code SpellChecker.lookup}
+   */
+  @Benchmark
+  @BenchmarkMode(Mode.AverageTime)
+  @OutputTimeUnit(TimeUnit.MILLISECONDS)
+  @Measurement(iterations = 1)
+  public void searchBenchmark() throws SpellCheckException {
+    for (String query : queries) {
+      totalMatches += spellChecker.lookup(query, Verbosity.valueOf(verbosity), maxEditDistance)
+          .size();
+    }
+  }
+
+  /** Drops the spell checker so each iteration starts from a newly built index. */
+  @TearDown(Level.Iteration)
+  public void tearDown() {
+    spellChecker = null;
+  }
+
+  /** Creates the weighted Damerau-Levenshtein distance configured from the given settings. */
+  private StringDistance getStringDistance(SpellCheckSettings spellCheckSettings,
+      CharDistance charDistance) {
+    return new WeightedDamerauLevenshteinDistance(spellCheckSettings.getDeletionWeight(),
+        spellCheckSettings.getInsertionWeight(),
+        spellCheckSettings.getReplaceWeight(),
+        spellCheckSettings.getTranspositionWeight(), charDistance);
+  }
+
+  /**
+   * Adds every record of a space-delimited dictionary resource to the data holder, using
+   * column 0 as the term and column 1 as its frequency.
+   *
+   * @throws IOException if the resource is not on the classpath or cannot be read
+   * @throws SpellCheckException propagated from {@code DataHolder.addItem}
+   */
+  private void indexData(String dataResourceName, DataHolder dataHolder)
+      throws IOException, SpellCheckException {
+    URL resourceUrl = this.getClass().getClassLoader().getResource(dataResourceName);
+    if (resourceUrl == null) {
+      throw new IOException("Dictionary resource not found on classpath: " + dataResourceName);
+    }
+    // try-with-resources closes the parser (and the underlying stream) even on failure.
+    try (CSVParser parser = CSVParser
+        .parse(resourceUrl, Charset.forName("UTF-8"), CSVFormat.DEFAULT.withDelimiter(' '))) {
+      for (CSVRecord csvRecord : parser) {
+        dataHolder
+            .addItem(new DictionaryItem(csvRecord.get(0), Double.valueOf(csvRecord.get(1)), 0d));
+      }
+    }
+  }
+
+  /**
+   * Reads column 0 of every record of a space-delimited query resource.
+   *
+   * @return the queries read; empty if reading fails, in which case the error is printed
+   */
+  private List<String> readQueries(String queryFile) {
+    List<String> queries = new ArrayList<>();
+    URL queryResourceUrl = this.getClass().getClassLoader().getResource(queryFile);
+    try (CSVParser qparser = CSVParser
+        .parse(queryResourceUrl, Charset.forName("UTF-8"),
+            CSVFormat.DEFAULT.withDelimiter(' '))) {
+      for (CSVRecord csvRecord : qparser) {
+        queries.add(csvRecord.get(0));
+      }
+    } catch (IOException ex) {
+      System.err.println("Error occured " + ex);
+    }
+    return queries;
+  }
+
+  /**
+   * Runs this benchmark through the JMH runner and writes the results as JSON.
+   *
+   * @throws RunnerException if the JMH run fails
+   * @throws IOException if the result file cannot be created
+   */
+  @Test
+  public void testBenchmarkSearch() throws RunnerException, IOException {
+    File file = checkFileAndCreate(SymSpellSearchBenchMark.class.getName());
+    Options opt = new OptionsBuilder()
+        .include(SymSpellSearchBenchMark.class.getSimpleName())
+        .addProfiler(MemoryProfiler.class.getName())
+        .resultFormat(ResultFormatType.JSON)
+        .result(file.getAbsolutePath())
+        .warmupIterations(0)
+        .measurementIterations(1)
+        .forks(1)
+        .build();
+    new Runner(opt).run();
+    System.out.println("Total Lookup results instance " + totalMatches);
+  }
+
+  /**
+   * Creates an empty, timestamped JSON result file in a {@code benchmark-result} folder that
+   * is created beside the classpath root directory.
+   *
+   * @param name currently unused; the file name is derived from this class's simple name
+   * @return the newly created result file
+   * @throws IOException if the file cannot be created
+   */
+  private File checkFileAndCreate(String name) throws IOException {
+    String targetFolderPath = Paths.get(
+        this.getClass().getResource("/").getFile()).getParent().toString() + "/benchmark-result/";
+
+    File targetFolder = new File(targetFolderPath);
+    targetFolder.mkdirs();
+
+    // (parent, child) constructor: File drops the trailing separator of its path, so string
+    // concatenation would create the file beside benchmark-result instead of inside it.
+    File file = new File(targetFolder,
+        SymSpellSearchBenchMark.class.getSimpleName()
+            + "_" + System.currentTimeMillis() + ".json");
+    if (file.exists()) {
+      file.delete();
+    }
+    file.createNewFile();
+    return file;
+  }
+}
diff --git a/symspell-benchmark/src/main/resources/frequency_dictionary_en_30_000.txt
b/symspell-lib/src/test/resources/frequency_dictionary_en_30_000.txt similarity index 100% rename from symspell-benchmark/src/main/resources/frequency_dictionary_en_30_000.txt rename to symspell-lib/src/test/resources/frequency_dictionary_en_30_000.txt diff --git a/symspell-benchmark/src/main/resources/frequency_dictionary_en_500_000.txt b/symspell-lib/src/test/resources/frequency_dictionary_en_500_000.txt similarity index 100% rename from symspell-benchmark/src/main/resources/frequency_dictionary_en_500_000.txt rename to symspell-lib/src/test/resources/frequency_dictionary_en_500_000.txt diff --git a/symspell-benchmark/src/main/resources/noisy_query_en_1000.txt b/symspell-lib/src/test/resources/noisy_query_en_1000.txt similarity index 100% rename from symspell-benchmark/src/main/resources/noisy_query_en_1000.txt rename to symspell-lib/src/test/resources/noisy_query_en_1000.txt diff --git a/symspell-solr/src/main/java/io/github/mightguy/symspell/solr/eventlistner/CustomSpellCheckListner.java b/symspell-solr/src/main/java/io/github/mightguy/symspell/solr/eventlistner/CustomSpellCheckListner.java index f42d8da..80ae553 100644 --- a/symspell-solr/src/main/java/io/github/mightguy/symspell/solr/eventlistner/CustomSpellCheckListner.java +++ b/symspell-solr/src/main/java/io/github/mightguy/symspell/solr/eventlistner/CustomSpellCheckListner.java @@ -28,6 +28,12 @@ public class CustomSpellCheckListner implements SolrEventListener { private final SpellChecker checker; private final List fieldArr; + /** + * Constructor for listener + * @param core + * @param checker + * @param fieldArr + */ public CustomSpellCheckListner(SolrCore core, SpellChecker checker, String[] fieldArr) { this.core = core; this.checker = checker; @@ -56,11 +62,22 @@ public void newSearcher(SolrIndexSearcher newSearcher, SolrIndexSearcher current } } + /** + * Init method + * @param args + */ @Override public void init(NamedList args) { // Nothing to do at init } + /** + * Reload method of spellcheck
listener + * @param newSearcher + * @param checker + * @throws IOException + * @throws SpellCheckException + */ public void reload(SolrIndexSearcher newSearcher, SpellChecker checker) throws IOException, SpellCheckException { diff --git a/symspell-solr/src/main/java/io/github/mightguy/symspell/solr/utils/SearchRequestUtil.java b/symspell-solr/src/main/java/io/github/mightguy/symspell/solr/utils/SearchRequestUtil.java index 64ff1bb..f165d73 100644 --- a/symspell-solr/src/main/java/io/github/mightguy/symspell/solr/utils/SearchRequestUtil.java +++ b/symspell-solr/src/main/java/io/github/mightguy/symspell/solr/utils/SearchRequestUtil.java @@ -10,6 +10,16 @@ public final class SearchRequestUtil { private SearchRequestUtil() { } + /** + * Get Class from class loader + * @param className + * @param loader + * @param abstractClass + * @param subPackages + * @param args + * @param <T> + * @return + */ public static T getClassFromLoader(String className, SolrResourceLoader loader, Class abstractClass, String[] subPackages, Object[] args) { Object obj = loader.newInstance(className, abstractClass, subPackages, new Class[0], args); @@ -19,6 +29,14 @@ public static T getClassFromLoader(String className, SolrResourceLoader load return (T) obj; } + /** + * Get value of Type T from named list + * @param namedList + * @param key + * @param def + * @param <T> + * @return + */ public static T getFromNamedList(NamedList namedList, String key, T def) { T val = (T) namedList.get(key); if (val == null) { @@ -27,11 +45,23 @@ public static T getFromNamedList(NamedList namedList, String key, T def) { return val; } + /** + * Check if the result is greater than the spellcheck threshold + * @param rsp + * @param spellCheckThreshold + * @return + */ public static boolean resultGreaterThanThreshold(SolrQueryResponse rsp, long spellCheckThreshold) { return !resultLessThanThreshold(rsp, spellCheckThreshold); } + /** + * Check if the result is less than the spellcheck threshold + * @param rsp + * @param
spellCheckThreshold + * @return + */ public static boolean resultLessThanThreshold(SolrQueryResponse rsp, long spellCheckThreshold) { if (null == rsp.getResponse()) { return true;