From 470b979a78709205983b118c572767e2be64f2f9 Mon Sep 17 00:00:00 2001 From: Martin Ledvinka Date: Mon, 24 Jun 2024 17:08:57 +0200 Subject: [PATCH] [kbss-cvut/termit-ui#461] Make text quote selector context length configurable. --- .../document/html/HtmlSelectorGenerators.java | 11 +++++++--- .../html/TextQuoteSelectorGenerator.java | 17 ++++++++------- .../cvut/kbss/termit/util/Configuration.java | 21 ++++++++++++++++--- .../environment/config/TestServiceConfig.java | 5 +++-- .../html/HtmlTermOccurrenceResolverTest.java | 2 +- .../html/TextQuoteSelectorGeneratorTest.java | 9 +++++--- 6 files changed, 45 insertions(+), 20 deletions(-) diff --git a/src/main/java/cz/cvut/kbss/termit/service/document/html/HtmlSelectorGenerators.java b/src/main/java/cz/cvut/kbss/termit/service/document/html/HtmlSelectorGenerators.java index 8d0779d5b..bf09c6ba2 100644 --- a/src/main/java/cz/cvut/kbss/termit/service/document/html/HtmlSelectorGenerators.java +++ b/src/main/java/cz/cvut/kbss/termit/service/document/html/HtmlSelectorGenerators.java @@ -18,10 +18,10 @@ package cz.cvut.kbss.termit.service.document.html; import cz.cvut.kbss.termit.model.selector.Selector; +import cz.cvut.kbss.termit.util.Configuration; import org.jsoup.nodes.Element; import org.springframework.stereotype.Service; -import java.util.Arrays; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -37,8 +37,13 @@ @Service public class HtmlSelectorGenerators { - private final List generators = Arrays - .asList(new TextQuoteSelectorGenerator(), new TextPositionSelectorGenerator()); + private final List generators; + + public HtmlSelectorGenerators(Configuration config) { + this.generators = List.of( + new TextQuoteSelectorGenerator(config.getTextAnalysis().getTextQuoteSelectorContextLength()), + new TextPositionSelectorGenerator()); + } /** * Generates selectors for the specified HTML/XML elements. 
diff --git a/src/main/java/cz/cvut/kbss/termit/service/document/html/TextQuoteSelectorGenerator.java b/src/main/java/cz/cvut/kbss/termit/service/document/html/TextQuoteSelectorGenerator.java index 313449d6a..a0dacbb03 100644 --- a/src/main/java/cz/cvut/kbss/termit/service/document/html/TextQuoteSelectorGenerator.java +++ b/src/main/java/cz/cvut/kbss/termit/service/document/html/TextQuoteSelectorGenerator.java @@ -33,10 +33,11 @@ */ class TextQuoteSelectorGenerator implements SelectorGenerator { - /** - * Length of the generated prefix and suffix - */ - static final int CONTEXT_LENGTH = 32; + private final int contextLength; + + TextQuoteSelectorGenerator(int contextLength) { + this.contextLength = contextLength; + } @Override public TextQuoteSelector generateSelector(Element... elements) { @@ -55,12 +56,12 @@ private Optional extractPrefix(Element start) { current = current.parent(); final List previousSiblings = current.childNodes().subList(0, previous.siblingIndex()); sb = extractNodeText(previousSiblings).append(sb); - if (sb.length() >= CONTEXT_LENGTH) { + if (sb.length() >= contextLength) { break; } previous = current; } - return !sb.isEmpty() ? Optional.of(sb.substring(Math.max(0, sb.length() - CONTEXT_LENGTH))) : Optional.empty(); + return !sb.isEmpty() ? Optional.of(sb.substring(Math.max(0, sb.length() - contextLength))) : Optional.empty(); } private Optional extractSuffix(Element end) { @@ -72,11 +73,11 @@ private Optional extractSuffix(Element end) { final List previousSiblings = current.childNodes() .subList(previous.siblingIndex() + 1, current.childNodeSize()); sb.append(extractNodeText(previousSiblings)); - if (sb.length() >= CONTEXT_LENGTH) { + if (sb.length() >= contextLength) { break; } previous = current; } - return !sb.isEmpty() ? Optional.of(sb.substring(0, Math.min(sb.length(), CONTEXT_LENGTH))) : Optional.empty(); + return !sb.isEmpty() ? 
Optional.of(sb.substring(0, Math.min(sb.length(), contextLength))) : Optional.empty(); } } diff --git a/src/main/java/cz/cvut/kbss/termit/util/Configuration.java b/src/main/java/cz/cvut/kbss/termit/util/Configuration.java index beeb1a410..1c624b46a 100644 --- a/src/main/java/cz/cvut/kbss/termit/util/Configuration.java +++ b/src/main/java/cz/cvut/kbss/termit/util/Configuration.java @@ -19,6 +19,7 @@ import cz.cvut.kbss.termit.model.acl.AccessLevel; import jakarta.validation.Valid; +import jakarta.validation.constraints.Min; import jakarta.validation.constraints.NotNull; import org.springframework.boot.context.properties.ConfigurationProperties; import org.springframework.context.annotation.Primary; @@ -570,13 +571,19 @@ public static class TextAnalysis { /** * URL of the text analysis service. */ - String url; + private String url; /** * Score threshold for a term occurrence for it to be saved into the repository. */ @NotNull - String termOccurrenceMinScore = Constants.SCORE_THRESHOLD.toString(); + private String termOccurrenceMinScore = Constants.SCORE_THRESHOLD.toString(); + + /** + * Maximum length of the prefix and suffix of a text quote selector. + */ + @Min(8) + private int textQuoteSelectorContextLength = 32; public String getUrl() { return url; @@ -593,6 +600,14 @@ public String getTermOccurrenceMinScore() { public void setTermOccurrenceMinScore(String termOccurrenceMinScore) { this.termOccurrenceMinScore = termOccurrenceMinScore; } + + public int getTextQuoteSelectorContextLength() { + return textQuoteSelectorContextLength; + } + + public void setTextQuoteSelectorContextLength(int textQuoteSelectorContextLength) { + this.textQuoteSelectorContextLength = textQuoteSelectorContextLength; + } } @Validated @@ -601,7 +616,7 @@ public static class Glossary { * IRI path to append to vocabulary IRI to get glossary identifier. 
*/ @NotNull - String fragment; + private String fragment; public String getFragment() { return fragment; diff --git a/src/test/java/cz/cvut/kbss/termit/environment/config/TestServiceConfig.java b/src/test/java/cz/cvut/kbss/termit/environment/config/TestServiceConfig.java index bb0cc6221..41874802e 100644 --- a/src/test/java/cz/cvut/kbss/termit/environment/config/TestServiceConfig.java +++ b/src/test/java/cz/cvut/kbss/termit/environment/config/TestServiceConfig.java @@ -25,6 +25,7 @@ import cz.cvut.kbss.termit.model.selector.Selector; import cz.cvut.kbss.termit.service.document.html.DummySelectorGenerator; import cz.cvut.kbss.termit.service.document.html.HtmlSelectorGenerators; +import cz.cvut.kbss.termit.util.Configuration; import org.aspectj.lang.Aspects; import org.jsoup.nodes.Element; import org.springframework.boot.test.context.TestConfiguration; @@ -76,8 +77,8 @@ public LocalValidatorFactoryBean validatorFactoryBean() { @Bean @Primary - public HtmlSelectorGenerators htmlSelectorGenerators() { - return new HtmlSelectorGenerators() { + public HtmlSelectorGenerators htmlSelectorGenerators(Configuration configuration) { + return new HtmlSelectorGenerators(configuration) { @Override public Set generateSelectors(Element... 
elements) { return Collections.singleton(new DummySelectorGenerator().generateSelector(elements)); diff --git a/src/test/java/cz/cvut/kbss/termit/service/document/html/HtmlTermOccurrenceResolverTest.java b/src/test/java/cz/cvut/kbss/termit/service/document/html/HtmlTermOccurrenceResolverTest.java index d5af0edaa..627a24feb 100644 --- a/src/test/java/cz/cvut/kbss/termit/service/document/html/HtmlTermOccurrenceResolverTest.java +++ b/src/test/java/cz/cvut/kbss/termit/service/document/html/HtmlTermOccurrenceResolverTest.java @@ -73,7 +73,7 @@ class HtmlTermOccurrenceResolverTest { @SuppressWarnings("unused") @Spy - private HtmlSelectorGenerators selectorGenerators = new HtmlSelectorGenerators(); + private HtmlSelectorGenerators selectorGenerators = new HtmlSelectorGenerators(config); @Mock private DocumentManager documentManager; diff --git a/src/test/java/cz/cvut/kbss/termit/service/document/html/TextQuoteSelectorGeneratorTest.java b/src/test/java/cz/cvut/kbss/termit/service/document/html/TextQuoteSelectorGeneratorTest.java index ac88220b8..809953279 100644 --- a/src/test/java/cz/cvut/kbss/termit/service/document/html/TextQuoteSelectorGeneratorTest.java +++ b/src/test/java/cz/cvut/kbss/termit/service/document/html/TextQuoteSelectorGeneratorTest.java @@ -25,11 +25,14 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import static cz.cvut.kbss.termit.service.document.html.TextQuoteSelectorGenerator.CONTEXT_LENGTH; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; class TextQuoteSelectorGeneratorTest { + private static final int CONTEXT_LENGTH = 32; + private TextQuoteSelectorGenerator sut; private Document document; @@ -37,7 +40,7 @@ class TextQuoteSelectorGeneratorTest { @BeforeEach void setUp() { this.document = new Document(""); - this.sut = new 
TextQuoteSelectorGenerator(); + this.sut = new TextQuoteSelectorGenerator(CONTEXT_LENGTH); } @Test