From e5cb04ab5127602797fc2eeb97f73bd2f07b2069 Mon Sep 17 00:00:00 2001 From: Thilo Goetz Date: Tue, 17 Jan 2017 16:40:46 +0100 Subject: [PATCH 1/3] OPENNLP-936: Add thread-safe versions of some tools. Thread safe versions of POSTaggerME, SentenceDetectorME and TokenizerME. Include test case as well. --- .../opennlp/tools/postag/POSTaggerME_TS.java | 65 ++++++++++++++ .../sentdetect/SentenceDetectorME_TS.java | 65 ++++++++++++++ .../tools/tokenize/TokenizerME_TS.java | 59 ++++++++++++ .../tools/eval/MultiThreadedToolsEval.java | 90 +++++++++++++++++++ 4 files changed, 279 insertions(+) create mode 100644 opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME_TS.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME_TS.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME_TS.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME_TS.java b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME_TS.java new file mode 100644 index 000000000..c6e77017e --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME_TS.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.postag; + +import opennlp.tools.util.Sequence; + +/** + * A thread-safe version of the POSTaggerME. Using it is completely transparent. You can use it in + * a single-threaded context as well, it only incurs a minimal overhead. + */ +public class POSTaggerME_TS implements POSTagger { + + private POSModel model; + + private ThreadLocal threadLocal = new ThreadLocal<>(); + + public POSTaggerME_TS(POSModel model) { + super(); + this.model = model; + } + + private final POSTaggerME getTagger() { + POSTaggerME tagger = threadLocal.get(); + if (tagger == null) { + tagger = new POSTaggerME(model); + threadLocal.set(tagger); + } + return tagger; + } + + @Override + public String[] tag(String[] sentence) { + return getTagger().tag(sentence); + } + + @Override + public String[] tag(String[] sentence, Object[] additionaContext) { + return getTagger().tag(sentence, additionaContext); + } + + @Override + public Sequence[] topKSequences(String[] sentence) { + return getTagger().topKSequences(sentence); + } + + @Override + public Sequence[] topKSequences(String[] sentence, Object[] additionaContext) { + return getTagger().topKSequences(sentence, additionaContext); + } +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME_TS.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME_TS.java new file mode 100644 index 000000000..1c68af8a3 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME_TS.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentdetect; + +import opennlp.tools.util.Span; + +/** + * A thread-safe version of SentenceDetectorME. Using it is completely transparent. You can use it in + * a single-threaded context as well, it only incurs a minimal overhead. + *

+ * Note, however, that this implementation uses a ThreadLocal. Although the implementation is + * lightweight as the model is not duplicated, if you have many long-running threads, you may run + * into memory issues. Be careful when you use this in a JEE application, for example. + */ +public class SentenceDetectorME_TS implements SentenceDetector { + + private SentenceModel model; + + private ThreadLocal sentenceDetectorThreadLocal = + new ThreadLocal<>(); + + public SentenceDetectorME_TS(SentenceModel model) { + super(); + this.model = model; + } + + // If a thread-local version exists, return it. Otherwise create, then return. + private SentenceDetectorME getSD() { + SentenceDetectorME sd = sentenceDetectorThreadLocal.get(); + if (sd == null) { + sd = new SentenceDetectorME(model); + sentenceDetectorThreadLocal.set(sd); + } + return sd; + } + + @Override + public String[] sentDetect(String s) { + return getSD().sentDetect(s); + } + + @Override + public Span[] sentPosDetect(String s) { + return getSD().sentPosDetect(s); + } + + public double[] getSentenceProbabilities() { + return getSD().getSentenceProbabilities(); + } +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME_TS.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME_TS.java new file mode 100644 index 000000000..6c2ed3580 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME_TS.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.tokenize; + +import opennlp.tools.util.Span; + +/** + * A thread-safe version of TokenizerME. Using it is completely transparent. You can use it in + * a single-threaded context as well, it only incurs a minimal overhead. + */ +public class TokenizerME_TS implements Tokenizer { + + private TokenizerModel model; + + private ThreadLocal tokenizerThreadLocal = new ThreadLocal(); + + public TokenizerME_TS(TokenizerModel model) { + super(); + this.model = model; + } + + private TokenizerME getTokenizer() { + TokenizerME tokenizer = tokenizerThreadLocal.get(); + if (tokenizer == null) { + tokenizer = new TokenizerME(model); + tokenizerThreadLocal.set(tokenizer); + } + return tokenizer; + } + + @Override + public String[] tokenize(String s) { + return getTokenizer().tokenize(s); + } + + @Override + public Span[] tokenizePos(String s) { + return getTokenizer().tokenizePos(s); + } + + public double[] getProbabilities() { + return getTokenizer().getTokenProbabilities(); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java new file mode 100644 index 000000000..ec41b1bf9 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.eval; + +import opennlp.tools.postag.POSModel; +import opennlp.tools.postag.POSTaggerME_TS; +import opennlp.tools.sentdetect.SentenceDetectorME_TS; +import opennlp.tools.sentdetect.SentenceModel; +import opennlp.tools.tokenize.TokenizerME_TS; +import opennlp.tools.tokenize.TokenizerModel; +import opennlp.tools.util.Span; + +import org.junit.Test; + +import java.io.File; +import java.io.IOException; + +/** + * Test the the reentrant tools implementations are really thread safe by running the concurrently. + * Replace the thread-safe versions with the non-safe versions to see this test case fail. + */ +public class MultiThreadedToolsEval { + + @Test + public void runMEToolsMultiThreaded() throws IOException, InterruptedException { + + File sModelFile = new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-sent.bin"); + SentenceModel sModel = new SentenceModel(sModelFile); + SentenceDetectorME_TS sentencer = new SentenceDetectorME_TS(sModel); + + File tModelFile = new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-token.bin"); + TokenizerModel tModel = new TokenizerModel(tModelFile); + TokenizerME_TS tokenizer = new TokenizerME_TS(tModel); + + File pModelFile = new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-maxent.bin"); + POSModel pModel = new POSModel(pModelFile); + POSTaggerME_TS tagger = new POSTaggerME_TS(pModel); + + final String text = "All human beings are born free and equal in dignity and rights. They " + + "are endowed with reason and conscience and should act towards one another in a " + + "spirit of brotherhood."; + + // Run numThreads threads, each processing the sample text numRunsPerThread times. + final int numThreads = 8; + final int numRunsPerThread = 1000; + Thread[] threads = new Thread[numThreads]; + + for (int i = 0; i < 8; i++) { + threads[i] = new Thread(new Runnable() { + @Override + public void run() { + for (int j = 0; j < numRunsPerThread; j++) { + Span[] sentences = sentencer.sentPosDetect(text); + for (Span span : sentences) { + String sentence = text.substring(span.getStart(), span.getEnd()); + Span[] tokens = tokenizer.tokenizePos(sentence); + String[] tokenStrings = new String[tokens.length]; + for (int k = 0; k < tokens.length; k++) { + tokenStrings[k] = sentence.substring(tokens[k].getStart(), + tokens[k].getEnd()); + } + String[] tags = tagger.tag(tokenStrings); + } + } + } + }); + threads[i].start(); + } + for (Thread t : threads) { + t.join(); + } + + } + +} From fe65d81cd1a799826678500c45b3089ef154fff4 Mon Sep 17 00:00:00 2001 From: Richard Zowalla Date: Tue, 1 Oct 2024 13:50:41 +0200 Subject: [PATCH 2/3] Fix checkstyle and adjust code --- .../sentdetect/SentenceDetectorME_TS.java | 12 +++++------ .../tools/eval/MultiThreadedToolsEval.java | 20 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME_TS.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME_TS.java index 1c68af8a3..0f9a3f7a6 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME_TS.java +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME_TS.java @@ -49,17 +49,17 @@ private SentenceDetectorME getSD() { return sd; } + public double[] getSentenceProbabilities() { + return getSD().getSentenceProbabilities(); + } + @Override - public String[] sentDetect(String s) { + public String[] sentDetect(CharSequence s) { return getSD().sentDetect(s); } @Override - public Span[] sentPosDetect(String s) { + public Span[] sentPosDetect(CharSequence s) { return getSD().sentPosDetect(s); } - - public double[] getSentenceProbabilities() { - return getSD().getSentenceProbabilities(); - } } diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java index ec41b1bf9..5be435295 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java +++ b/opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java @@ -17,6 +17,11 @@ package opennlp.tools.eval; +import java.io.File; +import java.io.IOException; + +import org.junit.jupiter.api.Test; + import opennlp.tools.postag.POSModel; import opennlp.tools.postag.POSTaggerME_TS; import opennlp.tools.sentdetect.SentenceDetectorME_TS; @@ -25,29 +30,24 @@ import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.util.Span; -import org.junit.Test; - -import java.io.File; -import java.io.IOException; - /** - * Test the the reentrant tools implementations are really thread safe by running the concurrently. + * Test the reentrant tools implementations are really thread safe by running concurrently. * Replace the thread-safe versions with the non-safe versions to see this test case fail. */ -public class MultiThreadedToolsEval { +public class MultiThreadedToolsEval extends AbstractEvalTest { @Test public void runMEToolsMultiThreaded() throws IOException, InterruptedException { - File sModelFile = new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-sent.bin"); + File sModelFile = new File(getOpennlpDataDir(), "models-sf/en-sent.bin"); SentenceModel sModel = new SentenceModel(sModelFile); SentenceDetectorME_TS sentencer = new SentenceDetectorME_TS(sModel); - File tModelFile = new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-token.bin"); + File tModelFile = new File(getOpennlpDataDir(), "models-sf/en-token.bin"); TokenizerModel tModel = new TokenizerModel(tModelFile); TokenizerME_TS tokenizer = new TokenizerME_TS(tModel); - File pModelFile = new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-maxent.bin"); + File pModelFile = new File(getOpennlpDataDir(), "models-sf/en-pos-maxent.bin"); POSModel pModel = new POSModel(pModelFile); POSTaggerME_TS tagger = new POSTaggerME_TS(pModel); From 02e282e659fcc3a85f99bf6f7bb6118430734b2d Mon Sep 17 00:00:00 2001 From: Richard Zowalla Date: Tue, 1 Oct 2024 13:53:44 +0200 Subject: [PATCH 3/3] Applies comments regarding class names --- ...OSTaggerME_TS.java => ThreadSafePOSTaggerME.java} | 12 +++++++----- ...rME_TS.java => ThreadSafeSentenceDetectorME.java} | 12 +++++++----- ...okenizerME_TS.java => ThreadSafeTokenizerME.java} | 10 ++++++---- .../opennlp/tools/eval/MultiThreadedToolsEval.java | 12 ++++++------ 4 files changed, 26 insertions(+), 20 deletions(-) rename opennlp-tools/src/main/java/opennlp/tools/postag/{POSTaggerME_TS.java => ThreadSafePOSTaggerME.java} (85%) rename opennlp-tools/src/main/java/opennlp/tools/sentdetect/{SentenceDetectorME_TS.java => ThreadSafeSentenceDetectorME.java} (83%) rename opennlp-tools/src/main/java/opennlp/tools/tokenize/{TokenizerME_TS.java => ThreadSafeTokenizerME.java} (84%) diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME_TS.java b/opennlp-tools/src/main/java/opennlp/tools/postag/ThreadSafePOSTaggerME.java similarity index 85% rename from opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME_TS.java rename to opennlp-tools/src/main/java/opennlp/tools/postag/ThreadSafePOSTaggerME.java index c6e77017e..52419ddfc 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME_TS.java +++ b/opennlp-tools/src/main/java/opennlp/tools/postag/ThreadSafePOSTaggerME.java @@ -17,24 +17,26 @@ package opennlp.tools.postag; +import opennlp.tools.commons.ThreadSafe; import opennlp.tools.util.Sequence; /** * A thread-safe version of the POSTaggerME. Using it is completely transparent. You can use it in * a single-threaded context as well, it only incurs a minimal overhead. */ -public class POSTaggerME_TS implements POSTagger { +@ThreadSafe +public class ThreadSafePOSTaggerME implements POSTagger { - private POSModel model; + private final POSModel model; - private ThreadLocal threadLocal = new ThreadLocal<>(); + private final ThreadLocal threadLocal = new ThreadLocal<>(); - public POSTaggerME_TS(POSModel model) { + public ThreadSafePOSTaggerME(POSModel model) { super(); this.model = model; } - private final POSTaggerME getTagger() { + private POSTaggerME getTagger() { POSTaggerME tagger = threadLocal.get(); if (tagger == null) { tagger = new POSTaggerME(model); diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME_TS.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java similarity index 83% rename from opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME_TS.java rename to opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java index 0f9a3f7a6..99abc6fb4 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME_TS.java +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java @@ -17,6 +17,7 @@ package opennlp.tools.sentdetect; +import opennlp.tools.commons.ThreadSafe; import opennlp.tools.util.Span; /** @@ -27,19 +28,20 @@ * lightweight as the model is not duplicated, if you have many long-running threads, you may run * into memory issues. Be careful when you use this in a JEE application, for example. */ -public class SentenceDetectorME_TS implements SentenceDetector { +@ThreadSafe +public class ThreadSafeSentenceDetectorME implements SentenceDetector { - private SentenceModel model; + private final SentenceModel model; - private ThreadLocal sentenceDetectorThreadLocal = + private final ThreadLocal sentenceDetectorThreadLocal = new ThreadLocal<>(); - public SentenceDetectorME_TS(SentenceModel model) { + public ThreadSafeSentenceDetectorME(SentenceModel model) { super(); this.model = model; } - // If a thread-local version exists, return it. Otherwise create, then return. + // If a thread-local version exists, return it. Otherwise, create, then return. private SentenceDetectorME getSD() { SentenceDetectorME sd = sentenceDetectorThreadLocal.get(); if (sd == null) { diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME_TS.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java similarity index 84% rename from opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME_TS.java rename to opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java index 6c2ed3580..b92dd5e02 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME_TS.java +++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java @@ -17,19 +17,21 @@ package opennlp.tools.tokenize; +import opennlp.tools.commons.ThreadSafe; import opennlp.tools.util.Span; /** * A thread-safe version of TokenizerME. Using it is completely transparent. You can use it in * a single-threaded context as well, it only incurs a minimal overhead. */ -public class TokenizerME_TS implements Tokenizer { +@ThreadSafe +public class ThreadSafeTokenizerME implements Tokenizer { - private TokenizerModel model; + private final TokenizerModel model; - private ThreadLocal tokenizerThreadLocal = new ThreadLocal(); + private final ThreadLocal tokenizerThreadLocal = new ThreadLocal<>(); - public TokenizerME_TS(TokenizerModel model) { + public ThreadSafeTokenizerME(TokenizerModel model) { super(); this.model = model; } diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java index 5be435295..fcb2bfa90 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java +++ b/opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java @@ -23,10 +23,10 @@ import org.junit.jupiter.api.Test; import opennlp.tools.postag.POSModel; -import opennlp.tools.postag.POSTaggerME_TS; -import opennlp.tools.sentdetect.SentenceDetectorME_TS; +import opennlp.tools.postag.ThreadSafePOSTaggerME; import opennlp.tools.sentdetect.SentenceModel; -import opennlp.tools.tokenize.TokenizerME_TS; +import opennlp.tools.sentdetect.ThreadSafeSentenceDetectorME; +import opennlp.tools.tokenize.ThreadSafeTokenizerME; import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.util.Span; @@ -41,15 +41,15 @@ public void runMEToolsMultiThreaded() throws IOException, InterruptedException { File sModelFile = new File(getOpennlpDataDir(), "models-sf/en-sent.bin"); SentenceModel sModel = new SentenceModel(sModelFile); - SentenceDetectorME_TS sentencer = new SentenceDetectorME_TS(sModel); + ThreadSafeSentenceDetectorME sentencer = new ThreadSafeSentenceDetectorME(sModel); File tModelFile = new File(getOpennlpDataDir(), "models-sf/en-token.bin"); TokenizerModel tModel = new TokenizerModel(tModelFile); - TokenizerME_TS tokenizer = new TokenizerME_TS(tModel); + ThreadSafeTokenizerME tokenizer = new ThreadSafeTokenizerME(tModel); File pModelFile = new File(getOpennlpDataDir(), "models-sf/en-pos-maxent.bin"); POSModel pModel = new POSModel(pModelFile); - POSTaggerME_TS tagger = new POSTaggerME_TS(pModel); + ThreadSafePOSTaggerME tagger = new ThreadSafePOSTaggerME(pModel); final String text = "All human beings are born free and equal in dignity and rights. They " + "are endowed with reason and conscience and should act towards one another in a " +