From 6b02c250ee3b762d9774b6342b0e54d678bc1e46 Mon Sep 17 00:00:00 2001 From: Lucky Sharma Date: Mon, 11 May 2020 05:49:22 +0530 Subject: [PATCH] Changes for the bigram as unigram support --- .../spellcheck/symspell/api/SpellChecker.java | 9 ++++++++- .../symspell/common/SpellCheckSettings.java | 6 ++++++ .../spellcheck/symspell/impl/InMemoryDataHolder.java | 3 ++- .../spellcheck/symspell/impl/SymSpellCheck.java | 10 ++++++++-- .../symspell/solr/component/SpellcheckComponent.java | 12 +++++++++++- .../mightguy/symspell/solr/utils/Constants.java | 1 + 6 files changed, 36 insertions(+), 5 deletions(-) diff --git a/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/api/SpellChecker.java b/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/api/SpellChecker.java index b291696..7e71c0e 100644 --- a/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/api/SpellChecker.java +++ b/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/api/SpellChecker.java @@ -40,9 +40,16 @@ public List lookupCompound(String word) throws SpellCheckExcepti return lookupCompound(word, spellCheckSettings.getMaxEditDistance()); } - public abstract List lookupCompound(String word, double editDistance) + + public abstract List lookupCompound(String word, double editDistance, + boolean tokenizeOnWhiteSpace) throws SpellCheckException; + public List lookupCompound(String word, double editDistance) + throws SpellCheckException { + return lookupCompound(word, editDistance, true); + } + public Composition wordBreakSegmentation(String phrase) throws SpellCheckException { return wordBreakSegmentation(phrase, spellCheckSettings.getPrefixLength(), diff --git a/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/common/SpellCheckSettings.java b/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/common/SpellCheckSettings.java index 0e9b168..cbe8c58 100644 --- 
a/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/common/SpellCheckSettings.java +++ b/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/common/SpellCheckSettings.java @@ -102,4 +102,10 @@ public class SpellCheckSettings { @Builder.Default private double editFactor = 0.3; + @Builder.Default + private boolean doKeySplit = true; + + @Builder.Default + private String keySplitRegex = "\\s+"; + } diff --git a/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/impl/InMemoryDataHolder.java b/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/impl/InMemoryDataHolder.java index e9c543f..4f2a2e7 100644 --- a/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/impl/InMemoryDataHolder.java +++ b/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/impl/InMemoryDataHolder.java @@ -131,7 +131,8 @@ public boolean addItem(final DictionaryItem dictionaryItem) throws SpellCheckExc private boolean addToDictionary(String key, double frequency) { - if (key.split("\\s+").length > 1) { + if (spellCheckSettings.isDoKeySplit() + && key.split(spellCheckSettings.getKeySplitRegex()).length > 1) { bigramsDictionary.put(key, frequency); if (frequency < spellCheckSettings.getBigramCountMin()) { spellCheckSettings.setBigramCountMin(frequency); diff --git a/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/impl/SymSpellCheck.java b/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/impl/SymSpellCheck.java index de6195e..f29b47d 100644 --- a/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/impl/SymSpellCheck.java +++ b/symspell-lib/src/main/java/io/github/mightguy/spellcheck/symspell/impl/SymSpellCheck.java @@ -48,7 +48,8 @@ public SymSpellCheck(DataHolder dataHolder, * the input string. 
*/ @Override - public List lookupCompound(String phrase, double maxEditDistance) + public List lookupCompound(String phrase, double maxEditDistance, + boolean tokenizeOnWhiteSpace) throws SpellCheckException { if (maxEditDistance > spellCheckSettings.getMaxEditDistance()) { @@ -63,7 +64,12 @@ public List lookupCompound(String phrase, double maxEditDistance if (spellCheckSettings.isLowerCaseTerms()) { phrase = phrase.toLowerCase(); } - String[] items = SpellHelper.tokenizeOnWhiteSpace(phrase); + String[] items; + if (tokenizeOnWhiteSpace) { + items = SpellHelper.tokenizeOnWhiteSpace(phrase); + } else { + items = new String[]{phrase}; + } List suggestions = new ArrayList<>(); List suggestionParts = new ArrayList<>(); boolean isLastCombi = false; diff --git a/symspell-solr/src/main/java/io/github/mightguy/symspell/solr/component/SpellcheckComponent.java b/symspell-solr/src/main/java/io/github/mightguy/symspell/solr/component/SpellcheckComponent.java index 8691ebc..3075a62 100644 --- a/symspell-solr/src/main/java/io/github/mightguy/symspell/solr/component/SpellcheckComponent.java +++ b/symspell-solr/src/main/java/io/github/mightguy/symspell/solr/component/SpellcheckComponent.java @@ -82,8 +82,14 @@ public void process(ResponseBuilder rb) throws IOException { } SolrParams params = rb.req.getParams(); String q = params.get(Constants.SPELLCHECK_Q, params.get(CommonParams.Q)); + boolean sow = params.getBool(Constants.SPELLCHECK_SOW, true); + List suggestions; try { - List suggestions = spellChecker.lookupCompound(q); + if (sow) { + suggestions = spellChecker.lookupCompound(q); + } else { + suggestions = spellChecker.lookupCompound(q, 2, false); + } if (!CollectionUtils.isEmpty(suggestions)) { addToResponse(rb, suggestions); } @@ -183,6 +189,10 @@ private void addSpellChecker(SolrCore core, NamedList spellcheckerNL) { SearchRequestUtil .getFromNamedList(spellcheckerNL, "verbosity", Verbosity.ALL.name()))) .countThreshold(SearchRequestUtil.getFromNamedList(spellcheckerNL, 
"countThreshold", 10)) + .doKeySplit( + SearchRequestUtil.getFromNamedList(spellcheckerNL, "createBigram", true)) + .keySplitRegex( + SearchRequestUtil.getFromNamedList(spellcheckerNL, "bigramSplitRegex", "\\s+")) .build(); StringDistance stringDistance = getStringDistance(spellcheckerNL, spellCheckSettings, core); diff --git a/symspell-solr/src/main/java/io/github/mightguy/symspell/solr/utils/Constants.java b/symspell-solr/src/main/java/io/github/mightguy/symspell/solr/utils/Constants.java index 2ae0247..a8ec554 100644 --- a/symspell-solr/src/main/java/io/github/mightguy/symspell/solr/utils/Constants.java +++ b/symspell-solr/src/main/java/io/github/mightguy/symspell/solr/utils/Constants.java @@ -13,4 +13,5 @@ private Constants() { public static final String SPELLCHECK_THRESHOLD = SPELLCHECK_PREFIX + "threshold"; public static final String SPELLCHECK_ENABLE = SPELLCHECK_PREFIX + "enable"; public static final String SPELLCHECK_BUILD = SPELLCHECK_PREFIX + "build"; + public static final String SPELLCHECK_SOW = SPELLCHECK_PREFIX + "sow"; }