Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OPENNLP-1586 Prevent resource leaks in BrownCluster and WordClusterDictionary #630

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
* <p>
* Originally available at: <a href="http://metaoptimize.com/projects/wordreprs/">
* http://metaoptimize.com/projects/wordreprs/</a>.
* <p>
* Further details can be found in the
* <a href="https://dl.acm.org/doi/10.5555/1858681.1858721">related research paper</a>.
* <p>
Expand Down Expand Up @@ -66,29 +67,32 @@ public void serialize(BrownCluster artifact, OutputStream out)
private final Map<String, String> tokenToClusterMap = new HashMap<>();

/**
* Generates the token to cluster map from Brown cluster an {@link InputStream}.
* Instantiates a {@link BrownCluster} and its related token to cluster map
* via an {@link InputStream}.
* <p>
* <b>Note:</b>
* we only add those tokens with frequency bigger than {@code 5}.
* For three-column lines, only tokens with a frequency greater than {@code 5}
* will be added; two-column lines are added without a frequency check.
*
* @param in A valid, open {@link InputStream} to read from.
* @throws IOException the io exception
* @throws IOException Thrown if errors occurred reading from {@link InputStream in}.
*/
public BrownCluster(InputStream in) throws IOException {

BufferedReader breader =
new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
String line;
while ((line = breader.readLine()) != null) {
String[] lineArray = tabPattern.split(line);
if (lineArray.length == 3) {
int freq = Integer.parseInt(lineArray[2]);
if (freq > 5 ) {
tokenToClusterMap.put(lineArray[1], lineArray[0]);
try (BufferedReader breader = new BufferedReader(
new InputStreamReader(in, StandardCharsets.UTF_8))) {

String line;
while ((line = breader.readLine()) != null) {
String[] lineArray = tabPattern.split(line);
if (lineArray.length == 3) {
int freq = Integer.parseInt(lineArray[2]);
if (freq > 5 ) {
tokenToClusterMap.put(lineArray[1], lineArray[0]);
}
}
else if (lineArray.length == 2) {
tokenToClusterMap.put(lineArray[0], lineArray[1]);
}
}
else if (lineArray.length == 2) {
tokenToClusterMap.put(lineArray[0], lineArray[1]);
}
}
}
Expand All @@ -104,12 +108,12 @@ public String lookupToken(String string) {
}

public void serialize(OutputStream out) throws IOException {
Writer writer = new BufferedWriter(new OutputStreamWriter(out));

for (Map.Entry<String, String> entry : tokenToClusterMap.entrySet()) {
writer.write(entry.getKey() + "\t" + entry.getValue() + "\n");
try (Writer writer = new BufferedWriter(new OutputStreamWriter(out))) {
for (Map.Entry<String, String> entry : tokenToClusterMap.entrySet()) {
writer.write(entry.getKey() + "\t" + entry.getValue() + "\n");
}
writer.flush();
}
writer.flush();
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ public class BrownTokenClassFeatureGenerator implements AdaptiveFeatureGenerator

private final BrownCluster brownLexicon;

/**
* Instantiates a {@link BrownTokenClassFeatureGenerator} via a specified
* {@link BrownCluster}.
* <p>
* The given instance is stored as-is; this constructor neither copies nor
* validates it.
*
* @param dict The token {@link BrownCluster dictionary} to use.
*/
public BrownTokenClassFeatureGenerator(BrownCluster dict) {
this.brownLexicon = dict;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@
package opennlp.tools.util.featuregen;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
* Obtain the paths listed in the pathLengths array from the Brown class.
* This class is not to be instantiated.
*
* @see BrownCluster
*/
public class BrownTokenClasses {

Expand All @@ -39,7 +41,7 @@ public class BrownTokenClasses {
*/
public static List<String> getWordClasses(String token, BrownCluster brownLexicon) {
if (brownLexicon.lookupToken(token) == null) {
return new ArrayList<>(0);
return Collections.emptyList();
} else {
String brownClass = brownLexicon.lookupToken(token);
List<String> pathLengthsList = new ArrayList<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,15 @@ public void serialize(WordClusterDictionary artifact, OutputStream out) throws I
* @throws IOException Thrown if IO errors occurred during read.
*/
public WordClusterDictionary(InputStream in) throws IOException {
BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
String line;
while ((line = reader.readLine()) != null) {
String[] parts = line.split(" ");
if (parts.length == 3) {
tokenToClusterMap.put(parts[0], parts[1].intern());
} else if (parts.length == 2) {
tokenToClusterMap.put(parts[0], parts[1].intern());
try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
String line;
while ((line = reader.readLine()) != null) {
String[] parts = line.split(" ");
if (parts.length == 3) {
tokenToClusterMap.put(parts[0], parts[1].intern());
} else if (parts.length == 2) {
tokenToClusterMap.put(parts[0], parts[1].intern());
}
}
}
}
Expand All @@ -74,13 +75,13 @@ public String lookupToken(String string) {
}

public void serialize(OutputStream out) throws IOException {
Writer writer = new BufferedWriter(new OutputStreamWriter(out));
try (Writer writer = new BufferedWriter(new OutputStreamWriter(out))) {
for (Map.Entry<String, String> entry : tokenToClusterMap.entrySet()) {
writer.write(entry.getKey() + " " + entry.getValue() + "\n");
}

for (Map.Entry<String, String> entry : tokenToClusterMap.entrySet()) {
writer.write(entry.getKey() + " " + entry.getValue() + "\n");
writer.flush();
}

writer.flush();
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,28 @@

import opennlp.tools.util.StringUtil;

/**
* An {@link AdaptiveFeatureGenerator} implementation of a word cluster feature generator.
* It is based on a pre-defined {@link WordClusterDictionary}.
*
* @see WordClusterDictionary
*/
public class WordClusterFeatureGenerator implements AdaptiveFeatureGenerator {

private final WordClusterDictionary tokenDictionary;
private final String resourceName;
private final boolean lowerCaseDictionary;

/**
* Instantiates a {@link WordClusterFeatureGenerator} via a specified
* {@link WordClusterDictionary}.
*
* @param dict The token {@link WordClusterDictionary dictionary} to use.
* @param dictResourceKey The prefix to use for detected features. Typically,
* the value for this prefix should be {@code "dict"}.
* @param lowerCaseDictionary {@code true} if tokens will be lower-cased during
* dictionary lookup, {@code false} otherwise.
*/
public WordClusterFeatureGenerator(WordClusterDictionary dict,
String dictResourceKey, boolean lowerCaseDictionary) {
tokenDictionary = dict;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

/**
* Defines a word cluster generator factory; it reads an element containing
* 'w2vwordcluster' as a tag name; these clusters are typically produced by
* 'w2vwordcluster' as a tag name. These clusters are typically produced by
* word2vec or Clark POS induction systems.
*/
public class WordClusterFeatureGeneratorFactory
Expand Down