diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderFactory.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderFactory.java index 6f31f0fef..da7277a4a 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderFactory.java @@ -212,13 +212,13 @@ public NameContextGenerator createContextGenerator() { AdaptiveFeatureGenerator featureGenerator = createFeatureGenerators(); if (featureGenerator == null) { - featureGenerator = new CachedFeatureGenerator( + featureGenerator = new CachedFeatureGenerator(new AggregatedFeatureGenerator( new WindowFeatureGenerator(new TokenFeatureGenerator(), 2, 2), new WindowFeatureGenerator(new TokenClassFeatureGenerator(true), 2, 2), new OutcomePriorFeatureGenerator(), new PreviousMapFeatureGenerator(), new BigramNameFeatureGenerator(), - new SentenceFeatureGenerator(true, false)); + new SentenceFeatureGenerator(true, false))); } return new DefaultNameContextGenerator(featureGenerator); diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CachedFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CachedFeatureGenerator.java index e26ffbdda..ff3fb6391 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CachedFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CachedFeatureGenerator.java @@ -19,6 +19,7 @@ package opennlp.tools.util.featuregen; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import opennlp.tools.util.Cache; @@ -30,52 +31,42 @@ public class CachedFeatureGenerator implements AdaptiveFeatureGenerator { private final AdaptiveFeatureGenerator generator; - private String[] prevTokens; - - private final Cache<Integer, List<String>> contextsCache; + private final Cache<Integer, Cache<Integer, List<String>>> contextCaches; + private final int cacheSize; private long numberOfCacheHits; private long 
numberOfCacheMisses; - @Deprecated - public CachedFeatureGenerator(AdaptiveFeatureGenerator... generators) { - this.generator = new AggregatedFeatureGenerator(generators); - contextsCache = new Cache<>(100); + public CachedFeatureGenerator(AdaptiveFeatureGenerator generator, int cacheSize) { + this.generator = generator; + this.contextCaches = new Cache<>(cacheSize); + this.cacheSize = cacheSize; } public CachedFeatureGenerator(AdaptiveFeatureGenerator generator) { - this.generator = generator; - contextsCache = new Cache<>(100); + this(generator, 100); } @Override public void createFeatures(List<String> features, String[] tokens, int index, - String[] previousOutcomes) { + String[] previousOutcomes) { - List<String> cacheFeatures; + final int tokenHash = Arrays.hashCode(tokens); - if (tokens == prevTokens) { - cacheFeatures = contextsCache.get(index); - - if (cacheFeatures != null) { - numberOfCacheHits++; - features.addAll(cacheFeatures); - return; - } + final Cache<Integer, List<String>> contextCache = contextCaches.computeIfAbsent(tokenHash, + k -> new Cache<>(cacheSize)); + List<String> cacheFeatures = contextCache.get(index); + if (cacheFeatures != null) { + numberOfCacheHits++; + features.addAll(cacheFeatures); } else { - contextsCache.clear(); - prevTokens = tokens; + numberOfCacheMisses++; + cacheFeatures = new ArrayList<>(); + generator.createFeatures(cacheFeatures, tokens, index, previousOutcomes); + contextCache.put(index, cacheFeatures); + features.addAll(cacheFeatures); } - - cacheFeatures = new ArrayList<>(); - - numberOfCacheMisses++; - - generator.createFeatures(cacheFeatures, tokens, index, previousOutcomes); - - contextsCache.put(index, cacheFeatures); - features.addAll(cacheFeatures); } @Override @@ -102,6 +93,12 @@ public long getNumberOfCacheMisses() { return numberOfCacheMisses; } + public void clearCache() { + this.contextCaches.clear(); + this.numberOfCacheMisses = 0; + this.numberOfCacheHits = 0; + } + @Override public String toString() { return super.toString() + ": hits=" + 
numberOfCacheHits diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/CachedFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/CachedFeatureGeneratorTest.java index 058618520..6f89fe4ba 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/CachedFeatureGeneratorTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/CachedFeatureGeneratorTest.java @@ -18,6 +18,7 @@ package opennlp.tools.util.featuregen; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.junit.jupiter.api.Assertions; @@ -47,6 +48,40 @@ void setUp() { features = new ArrayList<>(); } + @Test + void testCachingOfRealWorldSentence() { + CachedFeatureGenerator generator = new CachedFeatureGenerator(identityGenerator); + final String[] sentence = "He belongs to Apache \n Software Foundation .".split(" "); + int testIndex = 0; + + // after this call features are cached for testIndex + generator.createFeatures(features, sentence, testIndex, null); + Assertions.assertEquals(1, generator.getNumberOfCacheMisses()); + Assertions.assertEquals(0, generator.getNumberOfCacheHits()); + + generator.createFeatures(features, sentence, testIndex, null); + Assertions.assertEquals(1, generator.getNumberOfCacheMisses()); + Assertions.assertEquals(1, generator.getNumberOfCacheHits()); + + generator.createFeatures(features, sentence, testIndex + 1, null); + Assertions.assertEquals(2, generator.getNumberOfCacheMisses()); + Assertions.assertEquals(1, generator.getNumberOfCacheHits()); + + generator.createFeatures(features, sentence, testIndex + 1, null); + Assertions.assertEquals(2, generator.getNumberOfCacheMisses()); + Assertions.assertEquals(2, generator.getNumberOfCacheHits()); + + generator.clearCache(); + + Assertions.assertEquals(0, generator.getNumberOfCacheMisses()); + Assertions.assertEquals(0, generator.getNumberOfCacheHits()); + + generator.createFeatures(features, sentence, testIndex + 1, 
null); + Assertions.assertEquals(1, generator.getNumberOfCacheMisses()); + Assertions.assertEquals(0, generator.getNumberOfCacheHits()); + + } + /** * Tests if cache works for one sentence and two different token indexes. */ @@ -70,9 +105,7 @@ void testCachingOfSentence() { final String expectedToken = testSentence1[testIndex]; - testSentence1[testIndex] = null; - - generator.createFeatures(features, testSentence1, testIndex, null); + generator.createFeatures(features, Arrays.copyOf(testSentence1, testSentence1.length), testIndex, null); Assertions.assertEquals(1, generator.getNumberOfCacheMisses()); Assertions.assertEquals(1, generator.getNumberOfCacheHits()); @@ -86,7 +119,7 @@ void testCachingOfSentence() { int testIndex2 = testIndex + 1; - generator.createFeatures(features, testSentence1, testIndex2, null); + generator.createFeatures(features, Arrays.copyOf(testSentence1, testSentence1.length), testIndex2, null); Assertions.assertEquals(2, generator.getNumberOfCacheMisses()); Assertions.assertEquals(1, generator.getNumberOfCacheHits()); @@ -116,7 +149,7 @@ void testCacheClearAfterSentenceChange() { features.clear(); // use another sentence but same index - generator.createFeatures(features, testSentence2, testIndex, null); + generator.createFeatures(features, Arrays.copyOf(testSentence2, testSentence2.length), testIndex, null); Assertions.assertEquals(2, generator.getNumberOfCacheMisses()); Assertions.assertEquals(0, generator.getNumberOfCacheHits()); @@ -128,10 +161,7 @@ void testCacheClearAfterSentenceChange() { // check if features are really cached final String expectedToken = testSentence2[testIndex]; - - testSentence2[testIndex] = null; - - generator.createFeatures(features, testSentence2, testIndex, null); + generator.createFeatures(features, Arrays.copyOf(testSentence2, testSentence2.length), testIndex, null); Assertions.assertTrue(features.contains(expectedToken)); Assertions.assertEquals(1, features.size());