From 994f3dce3487455722c523f202c8b254bf2f94ae Mon Sep 17 00:00:00 2001 From: Martin Wiesner Date: Mon, 1 Jul 2024 11:48:46 +0200 Subject: [PATCH] OPENNLP-1586 Prevent resource leaks in BrownCluster and WordClusterDictionary - fixes the resource leaks in the related classes - improves JavaDoc along the path --- .../tools/util/featuregen/BrownCluster.java | 44 ++++++++++--------- .../BrownTokenClassFeatureGenerator.java | 6 +++ .../util/featuregen/BrownTokenClasses.java | 4 +- .../featuregen/WordClusterDictionary.java | 27 ++++++------ .../WordClusterFeatureGenerator.java | 16 +++++++ .../WordClusterFeatureGeneratorFactory.java | 2 +- 6 files changed, 64 insertions(+), 35 deletions(-) diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownCluster.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownCluster.java index b4ddd2b77..887596624 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownCluster.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownCluster.java @@ -39,6 +39,7 @@ *

* Originally available at: * http://metaoptimize.com/projects/wordreprs/. + *

* Further details can be found in the * related research paper. *

@@ -66,29 +67,32 @@ public void serialize(BrownCluster artifact, OutputStream out) private final Map tokenToClusterMap = new HashMap<>(); /** - * Generates the token to cluster map from Brown cluster an {@link InputStream}. + * Instantiates a {@link BrownCluster} and its related token to cluster map + * via an {@link InputStream}. *

* Note: - * we only add those tokens with frequency bigger than {@code 5}. + * Only tokens with frequency bigger than {@code 5} will be added. * * @param in A valid, open {@link InputStream} to read from. - * @throws IOException the io exception + * @throws IOException Thrown if errors occurred reading from {@link InputStream in}. */ public BrownCluster(InputStream in) throws IOException { - BufferedReader breader = - new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)); - String line; - while ((line = breader.readLine()) != null) { - String[] lineArray = tabPattern.split(line); - if (lineArray.length == 3) { - int freq = Integer.parseInt(lineArray[2]); - if (freq > 5 ) { - tokenToClusterMap.put(lineArray[1], lineArray[0]); + try (BufferedReader breader = new BufferedReader( + new InputStreamReader(in, StandardCharsets.UTF_8))) { + + String line; + while ((line = breader.readLine()) != null) { + String[] lineArray = tabPattern.split(line); + if (lineArray.length == 3) { + int freq = Integer.parseInt(lineArray[2]); + if (freq > 5 ) { + tokenToClusterMap.put(lineArray[1], lineArray[0]); + } + } + else if (lineArray.length == 2) { + tokenToClusterMap.put(lineArray[0], lineArray[1]); } - } - else if (lineArray.length == 2) { - tokenToClusterMap.put(lineArray[0], lineArray[1]); } } } @@ -104,12 +108,12 @@ public String lookupToken(String string) { } public void serialize(OutputStream out) throws IOException { - Writer writer = new BufferedWriter(new OutputStreamWriter(out)); - - for (Map.Entry entry : tokenToClusterMap.entrySet()) { - writer.write(entry.getKey() + "\t" + entry.getValue() + "\n"); + try (Writer writer = new BufferedWriter(new OutputStreamWriter(out))) { + for (Map.Entry entry : tokenToClusterMap.entrySet()) { + writer.write(entry.getKey() + "\t" + entry.getValue() + "\n"); + } + writer.flush(); } - writer.flush(); } @Override diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownTokenClassFeatureGenerator.java 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownTokenClassFeatureGenerator.java index 4e86fb2a5..dde9ef668 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownTokenClassFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownTokenClassFeatureGenerator.java @@ -26,6 +26,12 @@ public class BrownTokenClassFeatureGenerator implements AdaptiveFeatureGenerator private final BrownCluster brownLexicon; + /** + * Instantiates a {@link BrownTokenClassFeatureGenerator} via a specified + * {@link BrownCluster}. + * + * @param dict The token {@link BrownCluster dictionary} to use. + */ public BrownTokenClassFeatureGenerator(BrownCluster dict) { this.brownLexicon = dict; } diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownTokenClasses.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownTokenClasses.java index a880b91c2..65277dd2f 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownTokenClasses.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownTokenClasses.java @@ -18,12 +18,14 @@ package opennlp.tools.util.featuregen; import java.util.ArrayList; +import java.util.Collections; import java.util.List; /** * Obtain the paths listed in the pathLengths array from the Brown class. * This class is not to be instantiated. 
* + * @see BrownCluster */ public class BrownTokenClasses { @@ -39,7 +41,7 @@ public class BrownTokenClasses { */ public static List getWordClasses(String token, BrownCluster brownLexicon) { if (brownLexicon.lookupToken(token) == null) { - return new ArrayList<>(0); + return Collections.emptyList(); } else { String brownClass = brownLexicon.lookupToken(token); List pathLengthsList = new ArrayList<>(); diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterDictionary.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterDictionary.java index 2b1af7bcd..b4d917ffa 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterDictionary.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterDictionary.java @@ -57,14 +57,15 @@ public void serialize(WordClusterDictionary artifact, OutputStream out) throws I * @throws IOException Thrown if IO errors occurred during read. */ public WordClusterDictionary(InputStream in) throws IOException { - BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)); - String line; - while ((line = reader.readLine()) != null) { - String[] parts = line.split(" "); - if (parts.length == 3) { - tokenToClusterMap.put(parts[0], parts[1].intern()); - } else if (parts.length == 2) { - tokenToClusterMap.put(parts[0], parts[1].intern()); + try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) { + String line; + while ((line = reader.readLine()) != null) { + String[] parts = line.split(" "); + if (parts.length == 3) { + tokenToClusterMap.put(parts[0], parts[1].intern()); + } else if (parts.length == 2) { + tokenToClusterMap.put(parts[0], parts[1].intern()); + } } } } @@ -74,13 +75,13 @@ public String lookupToken(String string) { } public void serialize(OutputStream out) throws IOException { - Writer writer = new BufferedWriter(new OutputStreamWriter(out)); + try (Writer writer 
= new BufferedWriter(new OutputStreamWriter(out))) { + for (Map.Entry entry : tokenToClusterMap.entrySet()) { + writer.write(entry.getKey() + " " + entry.getValue() + "\n"); + } - for (Map.Entry entry : tokenToClusterMap.entrySet()) { - writer.write(entry.getKey() + " " + entry.getValue() + "\n"); + writer.flush(); } - - writer.flush(); } @Override diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterFeatureGenerator.java index f759d9c1e..b98cd9bba 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterFeatureGenerator.java @@ -21,12 +21,28 @@ import opennlp.tools.util.StringUtil; +/** + * An {@link AdaptiveFeatureGenerator} implementation of a word cluster feature generator. + * It is based on a pre-defined {@link WordClusterDictionary}. + * + * @see WordClusterDictionary + */ public class WordClusterFeatureGenerator implements AdaptiveFeatureGenerator { private final WordClusterDictionary tokenDictionary; private final String resourceName; private final boolean lowerCaseDictionary; + /** + * Instantiates a {@link WordClusterFeatureGenerator} via a specified + * {@link WordClusterDictionary}. + * + * @param dict The token {@link WordClusterDictionary dictionary} to use. + * @param dictResourceKey The prefix to use for detected features. Typically, + * the value for this prefix should be {@code "dict"}. + * @param lowerCaseDictionary {@code true} if tokens will be lower-cased during + * dictionary lookup, {@code false} otherwise. 
+ */ public WordClusterFeatureGenerator(WordClusterDictionary dict, String dictResourceKey, boolean lowerCaseDictionary) { tokenDictionary = dict; diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterFeatureGeneratorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterFeatureGeneratorFactory.java index 4381f1298..7ea19eddd 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterFeatureGeneratorFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterFeatureGeneratorFactory.java @@ -25,7 +25,7 @@ /** * Defines a word cluster generator factory; it reads an element containing - * 'w2vwordcluster' as a tag name; these clusters are typically produced by + * 'w2vwordcluster' as a tag name. These clusters are typically produced by * word2vec or clark pos induction systems. */ public class WordClusterFeatureGeneratorFactory