diff --git a/pom.xml b/pom.xml index f4dd850..33d2950 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ org.icatproject icat.lucene - 2.0.3-SNAPSHOT + 3.0.0-SNAPSHOT war ICAT Lucene @@ -14,7 +14,7 @@ https://repo.icatproject.org/repo github https://github.com/icatproject/icat.lucene - 5.5.5 + 8.11.2 @@ -86,6 +86,12 @@ ${luceneVersion} + + org.apache.lucene + lucene-facet + ${luceneVersion} + + org.apache.lucene lucene-backward-codecs @@ -102,7 +108,7 @@ org.icatproject icat.utils - 4.16.1 + 4.17.0-SNAPSHOT @@ -330,6 +336,3 @@ Exposes lucene calls to an icat server - - - diff --git a/src/main/config/run.properties.example b/src/main/config/run.properties.example index b010790..7702881 100644 --- a/src/main/config/run.properties.example +++ b/src/main/config/run.properties.example @@ -1,6 +1,16 @@ # Real comments in this file are marked with '#' whereas commented out lines # are marked with '!' -directory = ${HOME}/data/lucene -commitSeconds = 5 -ip = 127.0.0.1/32 +directory = ${HOME}/data/search +commitSeconds = 5 +maxShardSize = 2147483648 +ip = 127.0.0.1/32 +# A search taking longer than this will be cancelled to avoid blocking other users' searches +maxSearchTimeSeconds = 5 +# List of units to enable conversion to SI units when querying on numerical parameters +!units = J: eV 1.602176634e-19; \u2103: celsius, degC; K: kelvin +# List of fields that should be stored for facet filtering when searching +# In order to be available, these fields must be set when indexing the data +facetFields = datafileFormat.name instrument.name sample.type.name stringValue technique.name type.name +# Aggregate file sizes and counts in real time (this will have a performance impact on write operations) +aggregateFiles = false diff --git a/src/main/java/org/icatproject/lucene/DocumentMapping.java b/src/main/java/org/icatproject/lucene/DocumentMapping.java new file mode 100644 index 0000000..42f0e87 --- /dev/null +++ b/src/main/java/org/icatproject/lucene/DocumentMapping.java @@ -0,0 +1,127 @@ +package org.icatproject.lucene; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; +import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler; +import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler.ConfigurationKeys; + +public class DocumentMapping { + + /** + * Represents the parent child relationship between two ICAT entities. + */ + public static class ParentRelationship { + public String parentName; + public String joiningField; + public Set fields; + + /** + * @param parentName Name of the parent entity. + * @param joiningField Field that joins the child to its parent. + * @param fields Fields that should be updated by this relationship. + */ + public ParentRelationship(String parentName, String joiningField, String... 
fields) { + this.parentName = parentName; + this.joiningField = joiningField; + this.fields = new HashSet<>(Arrays.asList(fields)); + } + } + + private static Analyzer analyzer = new IcatSynonymAnalyzer();; + + public static final Set doubleFields = new HashSet<>(); + public static final Set longFields = new HashSet<>(); + public static final Set sortFields = new HashSet<>(); + public static final Set textFields = new HashSet<>(); + public static final Set indexedEntities = new HashSet<>(); + public static final Map relationships = new HashMap<>(); + + public static final StandardQueryParser genericParser = buildParser(); + public static final StandardQueryParser datafileParser = buildParser("name", "description", "location", + "datafileFormat.name", "visitId", "sample.name", "sample.type.name", "doi"); + public static final StandardQueryParser datasetParser = buildParser("name", "description", "sample.name", + "sample.type.name", "type.name", "visitId", "doi"); + public static final StandardQueryParser investigationParser = buildParser("name", "visitId", "title", "summary", + "facility.name", "type.name", "doi"); + public static final StandardQueryParser sampleParser = buildParser("sample.name", "sample.type.name"); + + static { + doubleFields.addAll(Arrays.asList("numericValue", "numericValueSI", "rangeTop", "rangeTopSI", "rangeBottom", + "rangeBottomSI")); + longFields.addAll( + Arrays.asList("date", "startDate", "endDate", "dateTimeValue", "investigation.startDate", "fileSize", + "fileCount", "datafile.id", "datafileFormat.id", "dataset.id", "facility.id", + "facilityCycle.id", "investigation.id", "instrument.id", "id", "sample.id", + "sample.investigation.id", "sample.type.id", "technique.id", "type.id", "user.id")); + sortFields.addAll( + Arrays.asList("datafile.id", "datafileFormat.id", "dataset.id", "facility.id", "facilityCycle.id", + "investigation.id", "instrument.id", "id", "sample.id", "sample.investigation.id", + "technique.id", "type.id", "user.id", "date", "name", "stringValue", "dateTimeValue", + "numericValue", "numericValueSI", "fileSize", "fileCount")); + textFields.addAll(Arrays.asList("name", "visitId", "description", "location", "dataset.name", + "investigation.name", "instrument.name", "instrument.fullName", "datafileFormat.name", "sample.name", + "sample.type.name", "technique.name", "technique.description", "technique.pid", "title", "summary", + "facility.name", "user.fullName", "type.name", "doi")); + + indexedEntities.addAll(Arrays.asList("Datafile", "Dataset", "Investigation", "DatafileParameter", + "DatasetParameter", "DatasetTechnique", "InstrumentScientist", "InvestigationFacilityCycle", + "InvestigationInstrument", "InvestigationParameter", "InvestigationUser", "Sample", "SampleParameter")); + + relationships.put("Instrument", + new ParentRelationship[] { new ParentRelationship("InvestigationInstrument", "instrument.id", + "instrument.name", "instrument.fullName") }); + relationships.put("User", + new ParentRelationship[] { + new ParentRelationship("InvestigationUser", "user.id", "user.name", "user.fullName"), + new ParentRelationship("InstrumentScientist", "user.id", "user.name", "user.fullName") }); + relationships.put("Sample", new ParentRelationship[] { + new ParentRelationship("Dataset", "sample.id", "sample.name", "sample.investigation.id"), + new ParentRelationship("Datafile", "sample.id", "sample.name", "sample.investigation.id") }); + relationships.put("SampleType", + new ParentRelationship[] { new ParentRelationship("Sample", "type.id", 
"type.name"), + new ParentRelationship("Dataset", "sample.type.id", "sample.type.name"), + new ParentRelationship("Datafile", "sample.type.id", "sample.type.name") }); + relationships.put("InvestigationType", + new ParentRelationship[] { new ParentRelationship("Investigation", "type.id", "type.name") }); + relationships.put("DatasetType", + new ParentRelationship[] { new ParentRelationship("Dataset", "type.id", "type.name") }); + relationships.put("DatafileFormat", + new ParentRelationship[] { + new ParentRelationship("Datafile", "datafileFormat.id", "datafileFormat.name") }); + relationships.put("Facility", + new ParentRelationship[] { new ParentRelationship("Investigation", "facility.id", "facility.name") }); + relationships.put("ParameterType", + new ParentRelationship[] { new ParentRelationship("DatafileParameter", "type.id", "type.name"), + new ParentRelationship("DatasetParameter", "type.id", "type.name"), + new ParentRelationship("InvestigationParameter", "type.id", "type.name"), + new ParentRelationship("SampleParameter", "type.id", "type.name") }); + relationships.put("Technique", + new ParentRelationship[] { new ParentRelationship("DatasetTechnique", "technique.id", "technique.name", + "technique.description", "technique.pid") }); + relationships.put("Investigation", + new ParentRelationship[] { + new ParentRelationship("Dataset", "investigation.id", "investigation.name", + "investigation.title", "investigation.startDate", "visitId"), + new ParentRelationship("datafile", "investigation.id", "investigation.name", "visitId") }); + relationships.put("Dataset", + new ParentRelationship[] { new ParentRelationship("Datafile", "dataset.id", "dataset.name") }); + } + + private static StandardQueryParser buildParser(String... defaultFields) { + StandardQueryParser parser = new StandardQueryParser(); + StandardQueryConfigHandler qpConf = (StandardQueryConfigHandler) parser.getQueryConfigHandler(); + qpConf.set(ConfigurationKeys.ANALYZER, analyzer); + qpConf.set(ConfigurationKeys.ALLOW_LEADING_WILDCARD, true); + if (defaultFields.length > 0) { + qpConf.set(ConfigurationKeys.MULTI_FIELDS, defaultFields); + } + + return parser; + } +} diff --git a/src/main/java/org/icatproject/lucene/FacetedDimension.java b/src/main/java/org/icatproject/lucene/FacetedDimension.java new file mode 100644 index 0000000..bfd1e7f --- /dev/null +++ b/src/main/java/org/icatproject/lucene/FacetedDimension.java @@ -0,0 +1,109 @@ +package org.icatproject.lucene; + +import java.util.ArrayList; +import java.util.List; + +import jakarta.json.Json; +import jakarta.json.JsonObjectBuilder; + +import org.apache.lucene.facet.FacetResult; +import org.apache.lucene.facet.LabelAndValue; +import org.apache.lucene.facet.range.DoubleRange; +import org.apache.lucene.facet.range.LongRange; +import org.apache.lucene.facet.range.Range; + +/** + * For a single dimension (field), stores labels (the unique values or ranges of + * values for that field in the index) and their respective counts (the number + * of times that label appears in different documents). + * + * For example, a dimension might be "colour", the label "red", and the count 5. + */ +public class FacetedDimension { + + private String dimension; + private List ranges; + private List labels; + private List counts; + + /** + * Creates an "empty" FacetedDimension. The dimension (field) is set but ranges, + * labels and counts are not. 
+ * + * @param dimension The dimension, or field, to be faceted + */ + public FacetedDimension(String dimension) { + this.dimension = dimension; + this.ranges = new ArrayList<>(); + this.labels = new ArrayList<>(); + this.counts = new ArrayList<>(); + } + + /** + * Extracts the count for each label in the FacetResult. If the label has + * already been encountered, the count is incremented rather than being + * overridden. Essentially, this allows faceting to be performed across multiple + * shards. + * + * @param facetResult A Lucene FacetResult object corresponding the relevant + * dimension + */ + public void addResult(FacetResult facetResult) { + for (LabelAndValue labelAndValue : facetResult.labelValues) { + String label = labelAndValue.label; + int labelIndex = labels.indexOf(label); + if (labelIndex == -1) { + labels.add(label); + counts.add(labelAndValue.value.longValue()); + } else { + counts.set(labelIndex, counts.get(labelIndex) + labelAndValue.value.longValue()); + } + } + } + + /** + * Formats the labels and counts into Json. + * + * @param aggregationsBuilder The JsonObjectBuilder to add the facets for this + * dimension to. + */ + public void buildResponse(JsonObjectBuilder aggregationsBuilder) { + JsonObjectBuilder bucketsBuilder = Json.createObjectBuilder(); + for (int i = 0; i < labels.size(); i++) { + JsonObjectBuilder bucketBuilder = Json.createObjectBuilder(); + bucketBuilder.add("doc_count", counts.get(i)); + if (ranges.size() > i) { + Range range = ranges.get(i); + if (range.getClass().getSimpleName().equals("LongRange")) { + bucketBuilder.add("from", ((LongRange) range).min); + bucketBuilder.add("to", ((LongRange) range).max); + } else if (range.getClass().getSimpleName().equals("DoubleRange")) { + bucketBuilder.add("from", ((DoubleRange) range).min); + bucketBuilder.add("to", ((DoubleRange) range).max); + } + } + bucketsBuilder.add(labels.get(i), bucketBuilder); + } + aggregationsBuilder.add(dimension, Json.createObjectBuilder().add("buckets", bucketsBuilder)); + } + + /** + * @return The list of Lucene Range Objects for use with numerical facets. + * For String faceting, this will be empty. + */ + public List getRanges() { + return ranges; + } + + /** + * @return The dimension that these labels and counts correspond to. + */ + public String getDimension() { + return dimension; + } + + public String toString() { + return dimension + ": " + labels + ", " + counts; + } + +} diff --git a/src/main/java/org/icatproject/lucene/Field.java b/src/main/java/org/icatproject/lucene/Field.java new file mode 100644 index 0000000..ad24647 --- /dev/null +++ b/src/main/java/org/icatproject/lucene/Field.java @@ -0,0 +1,184 @@ +package org.icatproject.lucene; + +import jakarta.json.JsonObject; + +import java.util.List; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.DoublePoint; +import org.apache.lucene.document.LongPoint; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.StoredField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.NumericUtils; + +/** + * Wrapper for the name, value and type (String/Text, long, double) of a field + * to be added to a Lucene Document. 
+ */ +class Field { + + private abstract class InnerField { + + public abstract void addSortable(Document document) throws NumberFormatException; + + public abstract void addToDocument(Document document) throws NumberFormatException; + + } + + private class InnerStringField extends InnerField { + + private String value; + + public InnerStringField(String value) { + this.value = value; + } + + @Override + public void addSortable(Document document) throws NumberFormatException { + if (DocumentMapping.sortFields.contains(name)) { + document.add(new SortedDocValuesField(name, new BytesRef(value))); + } + } + + @Override + public void addToDocument(Document document) throws NumberFormatException { + addSortable(document); + + if (facetable) { + document.add(new SortedSetDocValuesFacetField(name + ".keyword", value)); + document.add(new StringField(name + ".keyword", value, Store.NO)); + } + + if (DocumentMapping.textFields.contains(name)) { + document.add(new TextField(name, value, Store.YES)); + } else { + document.add(new StringField(name, value, Store.YES)); + } + + } + + } + + private class InnerLongField extends InnerField { + + private long value; + + public InnerLongField(long value) { + this.value = value; + } + + @Override + public void addSortable(Document document) throws NumberFormatException { + if (DocumentMapping.sortFields.contains(name)) { + document.add(new NumericDocValuesField(name, value)); + } + } + + @Override + public void addToDocument(Document document) throws NumberFormatException { + addSortable(document); + document.add(new LongPoint(name, value)); + document.add(new StoredField(name, value)); + } + + } + + private class InnerDoubleField extends InnerField { + + private double value; + + public InnerDoubleField(double value) { + this.value = value; + } + + @Override + public void addSortable(Document document) throws NumberFormatException { + if (DocumentMapping.sortFields.contains(name)) { + long sortableLong = NumericUtils.doubleToSortableLong(value); + document.add(new NumericDocValuesField(name, sortableLong)); + } + } + + @Override + public void addToDocument(Document document) throws NumberFormatException { + addSortable(document); + document.add(new DoublePoint(name, value)); + document.add(new StoredField(name, value)); + } + + } + + private String name; + private InnerField innerField; + private boolean facetable; + + /** + * Creates a wrapper for a Field. + * + * @param object JsonObject containing representations of multiple fields + * @param key Key of a specific field in object + * @param facetFields List of String field names which should be stored as a facetable keyword + */ + public Field(JsonObject object, String key, List facetFields) { + name = key; + facetable = facetFields.contains(name); + if (DocumentMapping.doubleFields.contains(name)) { + innerField = new InnerDoubleField(object.getJsonNumber(name).doubleValue()); + } else if (DocumentMapping.longFields.contains(name)) { + innerField = new InnerLongField(object.getJsonNumber(name).longValueExact()); + } else { + innerField = new InnerStringField(object.getString(name)); + } + } + + /** + * Creates a wrapper for a Field. 
+ * + * @param indexableField A Lucene IndexableField + * @param facetFields List of String fields which should be stored as a facetable keyword + */ + public Field(IndexableField indexableField, List facetFields) { + name = indexableField.name(); + facetable = facetFields.contains(name); + if (DocumentMapping.doubleFields.contains(name)) { + innerField = new InnerDoubleField(indexableField.numericValue().doubleValue()); + } else if (DocumentMapping.longFields.contains(name)) { + innerField = new InnerLongField(indexableField.numericValue().longValue()); + } else { + innerField = new InnerStringField(indexableField.stringValue()); + } + } + + /** + * Adds a sortable field to the passed document. This only accounts for sorting, + * if storage and searchability are also needed, see {@link #addToDocument}. The + * exact implementation depends on whether this is a String, long or double + * field. + * + * @param document The document to add to + * @throws NumberFormatException + */ + public void addSortable(Document document) throws NumberFormatException { + innerField.addSortable(document); + } + + /** + * Adds this field to the passed document. This accounts for sortable and + * facetable fields. The exact implementation depends on whether this is a + * String, long or double field. + * + * @param document The document to add to + * @throws NumberFormatException + */ + public void addToDocument(Document document) throws NumberFormatException { + innerField.addToDocument(document); + } + +} diff --git a/src/main/java/org/icatproject/lucene/IcatAnalyzer.java b/src/main/java/org/icatproject/lucene/IcatAnalyzer.java old mode 100644 new mode 100755 index cb6767e..7494b84 --- a/src/main/java/org/icatproject/lucene/IcatAnalyzer.java +++ b/src/main/java/org/icatproject/lucene/IcatAnalyzer.java @@ -1,25 +1,41 @@ package org.icatproject.lucene; +import java.util.Arrays; +import java.util.List; + import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.LowerCaseFilter; -import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.en.EnglishPossessiveFilter; import org.apache.lucene.analysis.en.PorterStemFilter; -import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; public class IcatAnalyzer extends Analyzer { + public static final CharArraySet SCIENTIFIC_STOP_WORDS_SET; + + /** + * Do not include (As At Be In No) in the stop words as these are chemical + * symbols. 
Otherwise, the set should match Lucene's ENGLISH_STOP_WORDS_SET + */ + static { + final List stopWords = + Arrays.asList("a", "an", "and", "are", "but", "by", "for", "if", "into", "is", + "it", "not", "on", "or", "such", "that", "the", "their", "then", + "there", "these", "they", "this", "to", "was", "will", "with"); + final CharArraySet stopSet = new CharArraySet(stopWords, false); + SCIENTIFIC_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet); + } + @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer source = new StandardTokenizer(); - TokenStream sink = new StandardFilter(source); - sink = new EnglishPossessiveFilter(sink); + TokenStream sink = new EnglishPossessiveFilter(source); sink = new LowerCaseFilter(sink); - sink = new StopFilter(sink, StopAnalyzer.ENGLISH_STOP_WORDS_SET); + sink = new StopFilter(sink, SCIENTIFIC_STOP_WORDS_SET); sink = new PorterStemFilter(sink); return new TokenStreamComponents(source, sink); } diff --git a/src/main/java/org/icatproject/lucene/IcatSynonymAnalyzer.java b/src/main/java/org/icatproject/lucene/IcatSynonymAnalyzer.java new file mode 100755 index 0000000..029f8fc --- /dev/null +++ b/src/main/java/org/icatproject/lucene/IcatSynonymAnalyzer.java @@ -0,0 +1,54 @@ +package org.icatproject.lucene; + +import java.io.BufferedReader; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.IOException; +import java.text.ParseException; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.en.EnglishPossessiveFilter; +import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.synonym.SolrSynonymParser; +import org.apache.lucene.analysis.synonym.SynonymGraphFilter; +import org.apache.lucene.analysis.synonym.SynonymMap; + +public class IcatSynonymAnalyzer extends Analyzer { + + private SynonymMap synonyms; + + public IcatSynonymAnalyzer() { + super(); + // Load synonyms from resource file + InputStream in = IcatSynonymAnalyzer.class.getClassLoader().getResourceAsStream("synonym.txt"); + if (in != null) { + BufferedReader reader = new BufferedReader(new InputStreamReader(in)); + SolrSynonymParser parser = new SolrSynonymParser(true, true, new IcatAnalyzer()); + try { + parser.parse(reader); + synonyms = parser.build(); + } catch (IOException | ParseException e) { + // If we cannot parse the synonyms, do nothing + // To all purposes this will now act as a plain IcatAnalyzer + } + } + } + + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer source = new StandardTokenizer(); + TokenStream sink = new EnglishPossessiveFilter(source); + sink = new LowerCaseFilter(sink); + sink = new StopFilter(sink, IcatAnalyzer.SCIENTIFIC_STOP_WORDS_SET); + sink = new PorterStemFilter(sink); + if (synonyms != null) { + sink = new SynonymGraphFilter(sink, synonyms, false); + } + return new TokenStreamComponents(source, sink); + } +} diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java old mode 100644 new mode 100755 index fc4dc2b..31efaea --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -6,31 +6,35 @@ import java.net.HttpURLConnection; 
import java.nio.file.FileVisitOption; import java.nio.file.Files; +import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; +import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Set; import java.util.Timer; import java.util.TimerTask; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Consumer; import jakarta.annotation.PostConstruct; import jakarta.annotation.PreDestroy; import jakarta.ejb.Singleton; import jakarta.json.Json; import jakarta.json.JsonArray; +import jakarta.json.JsonException; +import jakarta.json.JsonNumber; import jakarta.json.JsonObject; +import jakarta.json.JsonObjectBuilder; import jakarta.json.JsonReader; -import jakarta.json.JsonString; -import jakarta.json.JsonValue; +import jakarta.json.JsonStructure; import jakarta.json.stream.JsonGenerator; -import jakarta.json.stream.JsonParser; -import jakarta.json.stream.JsonParser.Event; import jakarta.servlet.http.HttpServletRequest; import jakarta.ws.rs.Consumes; -import jakarta.ws.rs.DELETE; import jakarta.ws.rs.GET; import jakarta.ws.rs.POST; import jakarta.ws.rs.Path; @@ -41,36 +45,50 @@ import jakarta.ws.rs.core.MediaType; import org.apache.lucene.document.Document; -import org.apache.lucene.document.DoubleField; +import org.apache.lucene.document.DoublePoint; import org.apache.lucene.document.Field.Store; -import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.LongPoint; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; -import org.apache.lucene.document.TextField; +import org.apache.lucene.facet.FacetResult; +import org.apache.lucene.facet.Facets; +import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.facet.FacetsConfig; +import org.apache.lucene.facet.range.DoubleRange; +import org.apache.lucene.facet.range.DoubleRangeFacetCounts; +import org.apache.lucene.facet.range.LongRange; +import org.apache.lucene.facet.range.LongRangeFacetCounts; +import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState; +import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; -import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; -import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler; -import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler.ConfigurationKeys; -import org.apache.lucene.search.BooleanClause.Occur; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.BooleanQuery.Builder; +import org.apache.lucene.queryparser.flexible.core.QueryNodeException; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; -import org.apache.lucene.search.NumericRangeQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.SearcherManager; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.TermRangeQuery; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.SortField.Type; 
+import org.apache.lucene.search.TimeLimitingCollector.TimeExceededException; +import org.apache.lucene.search.SortedNumericSortField; +import org.apache.lucene.search.TimeLimitingCollector; import org.apache.lucene.search.TopDocs; -import org.apache.lucene.search.WildcardQuery; -import org.apache.lucene.search.join.JoinUtil; -import org.apache.lucene.search.join.ScoreMode; +import org.apache.lucene.search.TopFieldCollector; +import org.apache.lucene.search.TopFieldDocs; +import org.apache.lucene.search.TotalHits; import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Counter; +import org.apache.lucene.util.NumericUtils; +import org.icatproject.lucene.SearchBucket.SearchType; import org.icatproject.lucene.exceptions.LuceneException; import org.icatproject.utils.CheckedProperties; +import org.icatproject.utils.IcatUnits; +import org.icatproject.utils.IcatUnits.Value; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Marker; @@ -80,48 +98,291 @@ @Singleton public class Lucene { - enum AttributeName { - type, name, value, date, store - } + /** + * A bucket for accessing the read and write functionality for a single "shard" + * Lucene index which can then be grouped to represent a single document type. + */ + private class ShardBucket { + private FSDirectory directory; + private IndexWriter indexWriter; + private SearcherManager searcherManager; + private DefaultSortedSetDocValuesReaderState state; + private AtomicLong documentCount; + + /** + * Creates a bucket for accessing the read and write functionality for a single + * "shard" Lucene index which can then be grouped to represent a single document + * type. + * + * @param shardPath Path to the directory used as storage for this shard. + * @throws IOException + */ + public ShardBucket(java.nio.file.Path shardPath) throws IOException { + directory = FSDirectory.open(shardPath); + IndexWriterConfig config = new IndexWriterConfig(analyzer); + indexWriter = new IndexWriter(directory, config); + String[] files = directory.listAll(); + if (files.length == 1 && files[0].equals("write.lock")) { + logger.debug("Directory only has the write.lock file so store and delete a dummy document"); + Document doc = new Document(); + doc.add(new StringField("dummy", "dummy", Store.NO)); + indexWriter.addDocument(facetsConfig.build(doc)); + indexWriter.commit(); + indexWriter.deleteDocuments(new Term("dummy", "dummy")); + indexWriter.commit(); + logger.debug("Now have " + indexWriter.getDocStats().numDocs + " documents indexed"); + } + searcherManager = new SearcherManager(indexWriter, null); + IndexSearcher indexSearcher = searcherManager.acquire(); + int numDocs = indexSearcher.getIndexReader().numDocs(); + documentCount = new AtomicLong(numDocs); + initState(indexSearcher); + logger.info("Created ShardBucket for directory {} with {} Documents", directory.getDirectory(), numDocs); + } + + /** + * Commits all pending cached documents to this shard. + * + * @return The number of documents committed to this shard. + * @throws IOException + */ + public int commit() throws IOException { + if (indexWriter.hasUncommittedChanges()) { + indexWriter.commit(); + searcherManager.maybeRefreshBlocking(); + initState(searcherManager.acquire()); + } + return indexWriter.numRamDocs(); + } - enum FieldType { - TextField, StringField, SortedDocValuesField, DoubleField + /** + * Creates a new DefaultSortedSetDocValuesReaderState object for this shard. 
+ * This can be expensive for indices with a large number of faceted dimensions + * and labels, so should only be done when needed. + * + * @param indexSearcher The underlying reader of this searcher is used to build + * the state + * @throws IOException + */ + private void initState(IndexSearcher indexSearcher) throws IOException { + try { + state = new DefaultSortedSetDocValuesReaderState(indexSearcher.getIndexReader()); + } catch (IllegalArgumentException e) { + // This can occur if no fields in the index have been faceted, in which case set + // state to null to ensure we don't (erroneously) use the old state + logger.error( + "No facets found in index, resulting in error: " + e.getClass() + " " + e.getMessage()); + state = null; + } finally { + searcherManager.release(indexSearcher); + } + } } + /** + * A bucket for accessing the high level functionality, such as + * searching, for a single document type. Incoming documents will be routed to + * one of the individual "shard" indices that are grouped by this Object. + */ private class IndexBucket { - private FSDirectory directory; - private IndexWriter indexWriter; - private SearcherManager searcherManager; + private String entityName; + private List shardList = new ArrayList<>(); private AtomicBoolean locked = new AtomicBoolean(); - } - public class Search { - public Map map; - public Query query; - public ScoreDoc lastDoc; - } + /** + * Creates a bucket for accessing the high level functionality, such as + * searching, for a single document type. Incoming documents will be routed to + * one of the individual "shard" indices that are grouped by this Object. + * + * @param entityName The name of the entity that this index contains documents + * for. + */ + public IndexBucket(String entityName) { + try { + logger.trace("Initialising bucket for {}", entityName); + this.entityName = entityName.toLowerCase(); + Long shardIndex = 0L; + java.nio.file.Path shardPath = luceneDirectory.resolve(entityName); + ShardBucket shardBucket; + // Create at least one shard, then keep creating them so long as directories + // exist and already contain Documents + do { + shardBucket = new ShardBucket(shardPath); + shardList.add(shardBucket); + shardIndex++; + shardPath = luceneDirectory.resolve(entityName + "_" + shardIndex); + } while (shardBucket.documentCount.get() > 0 && Files.isDirectory(shardPath)); + logger.debug("Bucket for {} is now ready with {} shards", entityName, shardIndex); + } catch (Throwable e) { + logger.error("Can't continue " + e.getClass() + " " + e.getMessage()); + } + } - enum When { - Now, Sometime - } + /** + * Acquires IndexSearchers from the SearcherManagers of the individual shards in + * this bucket. + * + * @return List of IndexSearchers for all shards in this bucket. + * @throws IOException + */ + public List acquireSearchers() throws IOException { + List subSearchers = new ArrayList<>(); + for (ShardBucket shardBucket : shardList) { + logger.trace("Acquiring searcher for shard"); + subSearchers.add(shardBucket.searcherManager.acquire()); + } + return subSearchers; + } + + /** + * Adds a document to the appropriate shard for this index. + * + * @param document The document to be added. + * @throws IOException + */ + public void addDocument(Document document) throws IOException { + ShardBucket shardBucket = routeShard(); + shardBucket.indexWriter.addDocument(document); + shardBucket.documentCount.incrementAndGet(); + } + + /** + * Deletes a document from the appropriate shard for this index. 
+ * + * @param icatId The ICAT id of the document to be deleted. + * @throws IOException + */ + public void deleteDocument(long icatId) throws IOException { + for (ShardBucket shardBucket : shardList) { + shardBucket.indexWriter.deleteDocuments(LongPoint.newExactQuery("id", icatId)); + } + } - private static final Logger logger = LoggerFactory.getLogger(Lucene.class); + /** + * Updates the document with the provided ICAT id. + * + * @param icatId The ICAT id of the document to be updated. + * @param document The document that will replace the old document. + * @throws IOException + */ + public void updateDocument(long icatId, Document document) throws IOException { + deleteDocument(icatId); + addDocument(document); + } + + /** + * Creates a new ShardBucket and stores it in the shardMap. + * + * @param shardKey The identifier for the new shard to be created. For + * simplicity, should an int starting at 0 and incrementing by 1 + * for each new shard. + * @return A new ShardBucket with the provided shardKey. + * @throws IOException + */ + public ShardBucket buildShardBucket(int shardKey) throws IOException { + ShardBucket shardBucket = new ShardBucket(luceneDirectory.resolve(entityName + "_" + shardKey)); + shardList.add(shardBucket); + return shardBucket; + } + + /** + * Commits Documents for writing on all "shard" indices for this bucket. + * + * @param command The high level command which called this function. Only + * used for debug logging. + * @param entityName The name of the entities being committed. Only used for + * debug logging. + * @throws IOException + */ + public void commit(String command, String entityName) throws IOException { + for (ShardBucket shardBucket : shardList) { + int cached = shardBucket.commit(); + if (cached != 0) { + int numDocs = shardBucket.indexWriter.getDocStats().numDocs; + String directoryName = shardBucket.directory.getDirectory().toString(); + logger.debug("{} has committed {} {} changes to Lucene - now have {} documents indexed in {}", + command, cached, entityName, numDocs, directoryName); + } + } + } + + /** + * Commits and closes all "shard" indices for this bucket. + * + * @throws IOException + */ + public void close() throws IOException { + for (ShardBucket shardBucket : shardList) { + shardBucket.searcherManager.close(); + shardBucket.indexWriter.commit(); + shardBucket.indexWriter.close(); + shardBucket.directory.close(); + } + } + + /** + * @return The ShardBucket currently in use for indexing new Documents. + */ + public ShardBucket getCurrentShardBucket() { + int size = shardList.size(); + return shardList.get(size - 1); + } + /** + * Provides the ShardBucket that should be used for writing the next Document. + * All Documents up to luceneMaxShardSize are indexed in the first shard, after + * that a new shard is created for the next luceneMaxShardSize Documents and so + * on. + * + * @return The ShardBucket that the relevant Document is/should be indexed in. + * @throws IOException + */ + public ShardBucket routeShard() throws IOException { + ShardBucket shardBucket = getCurrentShardBucket(); + if (shardBucket.documentCount.get() >= luceneMaxShardSize) { + shardBucket.indexWriter.commit(); + shardBucket = buildShardBucket(shardList.size()); + } + return shardBucket; + } + + /** + * Releases all provided searchers for the shards in this bucket. + * + * @param subSearchers List of IndexSearcher, in shard order. + * @throws IOException + * @throws LuceneException If the number of searchers and shards isn't the same. 
+ */ + public void releaseSearchers(List subSearchers) throws IOException, LuceneException { + if (subSearchers.size() != shardList.size()) { + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, + "Was expecting the same number of DirectoryReaders as ShardBuckets, but had " + + subSearchers.size() + ", " + shardList.size() + " respectively."); + } + int i = 0; + for (ShardBucket shardBucket : shardList) { + shardBucket.searcherManager.release(subSearchers.get(i)); + i++; + } + } + } + + static final Logger logger = LoggerFactory.getLogger(Lucene.class); private static final Marker fatal = MarkerFactory.getMarker("FATAL"); + private static final IcatAnalyzer analyzer = new IcatAnalyzer(); - private java.nio.file.Path luceneDirectory; + private final FacetsConfig facetsConfig = new FacetsConfig(); + private java.nio.file.Path luceneDirectory; private int luceneCommitMillis; - - private AtomicLong bucketNum = new AtomicLong(); + private long luceneMaxShardSize; + private long maxSearchTimeSeconds; + private boolean aggregateFiles; private Map indexBuckets = new ConcurrentHashMap<>(); - private StandardQueryParser parser; - private Timer timer; - private IcatAnalyzer analyzer; - - private Map searches = new ConcurrentHashMap<>(); + public List facetFields = new ArrayList<>(); + public IcatUnits icatUnits; /** * return the version of the lucene server @@ -144,50 +405,28 @@ public String getVersion() { @Consumes(MediaType.APPLICATION_JSON) @Path("modify") public void modify(@Context HttpServletRequest request) throws LuceneException { - logger.debug("Requesting modify"); int count = 0; - - try (JsonParser parser = Json.createParser(request.getInputStream())) { - - Event ev = parser.next(); - if (ev != Event.START_ARRAY) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Unexpected " + ev.name()); - } - ev = parser.next(); - - while (true) { - if (ev == Event.END_ARRAY) { - break; - } - if (ev != Event.START_ARRAY) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Unexpected " + ev.name()); - } - ev = parser.next(); - String entityName = parser.getString(); - ev = parser.next(); - Long id = (ev == Event.VALUE_NULL) ? 
null : parser.getLong(); - ev = parser.next(); - if (ev == Event.VALUE_NULL) { - try { - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> createBucket(k)); - if (bucket.locked.get()) { - throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, - "Lucene locked for " + entityName); - } - bucket.indexWriter.deleteDocuments(new Term("id", Long.toString(id))); - } catch (IOException e) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } + try (JsonReader reader = Json.createReader(request.getInputStream())) { + List operations = reader.readArray().getValuesAs(JsonObject.class); + for (JsonObject operation : operations) { + if (operation.size() != 1) { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Operation object should only have one key/value pair, but request had " + + operation.size()); + } else if (operation.containsKey("create")) { + create(operation.getJsonObject("create")); + } else if (operation.containsKey("update")) { + update(operation.getJsonObject("update")); + } else if (operation.containsKey("delete")) { + delete(operation.getJsonObject("delete")); } else { - add(request, entityName, When.Sometime, parser, id); + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Operation key should be one of 'create', 'update', 'delete', but it was " + + operation.keySet()); } - ev = parser.next(); // end of triple - count++; - ev = parser.next(); // either end of input or start of new - // triple } - + count = operations.size(); } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } @@ -195,97 +434,6 @@ public void modify(@Context HttpServletRequest request) throws LuceneException { } - /* if id is not null this is actually an update */ - private void add(HttpServletRequest request, String entityName, When when, JsonParser parser, Long id) - throws LuceneException, IOException { - - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> createBucket(k)); - - AttributeName attName = null; - FieldType fType = null; - String name = null; - String value = null; - Double dvalue = null; - Store store = Store.NO; - Document doc = new Document(); - - parser.next(); // Skip the [ - while (parser.hasNext()) { - Event ev = parser.next(); - if (ev == Event.KEY_NAME) { - try { - attName = AttributeName.valueOf(parser.getString()); - } catch (Exception e) { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "Found unknown field type " + e.getMessage()); - } - } else if (ev == Event.VALUE_STRING) { - if (attName == AttributeName.type) { - try { - fType = FieldType.valueOf(parser.getString()); - } catch (Exception e) { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "Found unknown field type " + e.getMessage()); - } - } else if (attName == AttributeName.name) { - name = parser.getString(); - } else if (attName == AttributeName.value) { - value = parser.getString(); - } else { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, "Bad VALUE_STRING " + attName); - } - } else if (ev == Event.VALUE_NUMBER) { - long num = parser.getLong(); - if (fType == FieldType.SortedDocValuesField) { - value = Long.toString(num); - } else if (fType == FieldType.DoubleField) { - dvalue = parser.getBigDecimal().doubleValue(); - } else { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "Bad VALUE_NUMBER " + attName + " " + fType); - } - } else if (ev == Event.VALUE_TRUE) { - if (attName == 
AttributeName.store) { - store = Store.YES; - } else { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, "Bad VALUE_TRUE " + attName); - } - } else if (ev == Event.START_OBJECT) { - fType = null; - name = null; - value = null; - store = Store.NO; - } else if (ev == Event.END_OBJECT) { - if (fType == FieldType.TextField) { - doc.add(new TextField(name, value, store)); - } else if (fType == FieldType.StringField) { - doc.add(new StringField(name, value, store)); - } else if (fType == FieldType.SortedDocValuesField) { - doc.add(new SortedDocValuesField(name, new BytesRef(value))); - } else if (fType == FieldType.DoubleField) { - doc.add(new DoubleField(name, dvalue, store)); - } - } else if (ev == Event.END_ARRAY) { - if (id == null) { - if (bucket.locked.get() && when == When.Sometime) { - throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, - "Lucene locked for " + entityName); - } - bucket.indexWriter.addDocument(doc); - } else { - if (bucket.locked.get()) { - throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, - "Lucene locked for " + entityName); - } - bucket.indexWriter.updateDocument(new Term("id", id.toString()), doc); - } - return; - } else { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, "Unexpected token in Json: " + ev); - } - } - } - /** * Expect an array of documents each encoded as an array of things to add to * the document @@ -295,22 +443,22 @@ private void add(HttpServletRequest request, String entityName, When when, JsonP @Path("addNow/{entityName}") public void addNow(@Context HttpServletRequest request, @PathParam("entityName") String entityName) throws LuceneException { + List documents; + JsonStructure value = null; logger.debug("Requesting addNow of {}", entityName); - int count = 0; - try (JsonParser parser = Json.createParser(request.getInputStream())) { - Event ev = parser.next(); // Opening [ - while (true) { - ev = parser.next(); // Final ] or another document - if (ev == Event.END_ARRAY) { - break; - } - add(request, entityName, When.Now, parser, null); - count++; + try (JsonReader reader = Json.createReader(request.getInputStream())) { + value = reader.read(); + documents = ((JsonArray) value).getValuesAs(JsonObject.class); + for (JsonObject document : documents) { + createNow(entityName, document); } + } catch (JsonException e) { + logger.error("Could not parse JSON from {}", value); + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } - logger.debug("Added {} {} documents", count, entityName); + logger.debug("Added {} {} documents", documents.size(), entityName); } /* @@ -323,11 +471,7 @@ public void clear() throws LuceneException { logger.info("Requesting clear"); exit(); - timer = new Timer("LuceneCommitTimer"); - - bucketNum.set(0); indexBuckets.clear(); - searches.clear(); try { Files.walk(luceneDirectory, FileVisitOption.FOLLOW_LINKS).sorted(Comparator.reverseOrder()) @@ -336,26 +480,24 @@ public void clear() throws LuceneException { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } - timer.schedule(new CommitTimerTask(), luceneCommitMillis, luceneCommitMillis); + initTimer(); logger.info("clear complete - ready to go again"); } + /** + * Commits any pending documents to their respective index. 
+ */ @POST @Path("commit") public void commit() throws LuceneException { - logger.debug("Requesting commit"); + logger.debug("Requesting commit for {} IndexBuckets", indexBuckets.size()); try { for (Entry entry : indexBuckets.entrySet()) { IndexBucket bucket = entry.getValue(); if (!bucket.locked.get()) { - int cached = bucket.indexWriter.numRamDocs(); - bucket.indexWriter.commit(); - if (cached != 0) { - logger.debug("Synch has committed {} {} changes to Lucene - now have {} documents indexed", - cached, entry.getKey(), bucket.indexWriter.numDocs()); - } - bucket.searcherManager.maybeRefreshBlocking(); + logger.trace("{} is unlocked", entry.getKey()); + bucket.commit("Synch", entry.getKey()); } } } catch (IOException e) { @@ -363,203 +505,328 @@ public void commit() throws LuceneException { } } - private IndexBucket createBucket(String name) { - try { - IndexBucket bucket = new IndexBucket(); - FSDirectory directory = FSDirectory.open(luceneDirectory.resolve(name)); - bucket.directory = directory; - IndexWriterConfig config = new IndexWriterConfig(analyzer); - IndexWriter iwriter = new IndexWriter(directory, config); - String[] files = directory.listAll(); - if (files.length == 1 && files[0].equals("write.lock")) { - logger.debug("Directory only has the write.lock file so store and delete a dummy document"); - Document doc = new Document(); - doc.add(new StringField("dummy", "dummy", Store.NO)); - iwriter.addDocument(doc); - iwriter.commit(); - iwriter.deleteDocuments(new Term("dummy", "dummy")); - iwriter.commit(); - logger.debug("Now have " + iwriter.numDocs() + " documents indexed"); + /** + * Creates a new Lucene document, provided that the target index is not locked + * for another operation. + * + * @param operationBody JsonObject containing the "_index" that the new "doc" + * should be created in. 
+ * @throws NumberFormatException + * @throws IOException + * @throws LuceneException + */ + private void create(JsonObject operationBody) throws NumberFormatException, IOException, LuceneException { + String entityName = operationBody.getString("_index"); + if (DocumentMapping.relationships.containsKey(entityName)) { + updateByRelation(operationBody, false); + } + if (DocumentMapping.indexedEntities.contains(entityName)) { + JsonObject documentObject = operationBody.getJsonObject("doc"); + Document document = parseDocument(documentObject); + logger.trace("create {} {}", entityName, document); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); + if (bucket.locked.get()) { + throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, + "Lucene locked for " + entityName); + } + bucket.addDocument(facetsConfig.build(document)); + // Special case for filesizes + if (aggregateFiles && entityName.equals("Datafile")) { + JsonNumber jsonFileSize = documentObject.getJsonNumber("fileSize"); + if (jsonFileSize != null) { + JsonNumber datasetId = documentObject.getJsonNumber("dataset.id"); + JsonNumber investigationId = documentObject.getJsonNumber("investigation.id"); + aggregateFileSize(jsonFileSize.longValueExact(), 0, 1, datasetId, "dataset"); + aggregateFileSize(jsonFileSize.longValueExact(), 0, 1, investigationId, "investigation"); + } } - bucket.indexWriter = iwriter; - bucket.searcherManager = new SearcherManager(iwriter, false, null); - logger.debug("Bucket for {} is now ready", name); - return bucket; - } catch (Throwable e) { - logger.error("Can't continue " + e.getClass() + " " + e.getMessage()); - return null; } } - @POST - @Consumes(MediaType.APPLICATION_JSON) - @Produces(MediaType.APPLICATION_JSON) - @Path("datafiles") - public String datafiles(@Context HttpServletRequest request, @QueryParam("maxResults") int maxResults) - throws LuceneException { - - Long uid = null; - try { - uid = bucketNum.getAndIncrement(); - Search search = new Search(); - searches.put(uid, search); - Map map = new HashMap<>(); - search.map = map; - - try (JsonReader r = Json.createReader(request.getInputStream())) { - JsonObject o = r.readObject(); - String userName = o.getString("user", null); - - BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); - - if (userName != null) { - Query iuQuery = JoinUtil.createJoinQuery("investigation", false, "id", - new TermQuery(new Term("name", userName)), getSearcher(map, "InvestigationUser"), - ScoreMode.None); - - Query invQuery = JoinUtil.createJoinQuery("id", false, "investigation", iuQuery, - getSearcher(map, "Investigation"), ScoreMode.None); - - Query dsQuery = JoinUtil.createJoinQuery("id", false, "dataset", invQuery, - getSearcher(map, "Dataset"), ScoreMode.None); - - theQuery.add(dsQuery, Occur.MUST); - } - - String text = o.getString("text", null); - if (text != null) { - theQuery.add(parser.parse(text, "text"), Occur.MUST); - } - - String lower = o.getString("lower", null); - String upper = o.getString("upper", null); - if (lower != null && upper != null) { - theQuery.add(new TermRangeQuery("date", new BytesRef(lower), new BytesRef(upper), true, true), - Occur.MUST); - } + /** + * Changes the fileSize on an entity by the specified amount. This is used to + * aggregate the individual fileSize of Datafiles up to Dataset and + * Investigation sizes. + * + * @param sizeToAdd Increases the fileSize of the entity by this much. + * Should be 0 for deletes. 
+ * @param sizeToSubtract Decreases the fileSize of the entity by this much. + * Should be 0 for creates. + * @param deltaFileCount Changes the file count by this much. + * @param entityId Icat id of entity to update as a JsonNumber. + * @param index Index (entity) to update. + * @throws IOException + */ + private void aggregateFileSize(long sizeToAdd, long sizeToSubtract, long deltaFileCount, JsonNumber entityId, + String index) throws IOException { + if (entityId != null) { + aggregateFileSize(sizeToAdd, sizeToSubtract, deltaFileCount, entityId.longValueExact(), index); + } + } - if (o.containsKey("params")) { - JsonArray params = o.getJsonArray("params"); - IndexSearcher datafileParameterSearcher = getSearcher(map, "DatafileParameter"); - for (JsonValue p : params) { - BooleanQuery.Builder paramQuery = parseParameter(p); - Query toQuery = JoinUtil.createJoinQuery("datafile", false, "id", paramQuery.build(), - datafileParameterSearcher, ScoreMode.None); - theQuery.add(toQuery, Occur.MUST); + /** + * Changes the fileSize on an entity by the specified amount. This is used to + * aggregate the individual fileSize of Datafiles up to Dataset and + * Investigation sizes. + * + * @param sizeToAdd Increases the fileSize of the entity by this much. + * Should be 0 for deletes. + * @param sizeToSubtract Decreases the fileSize of the entity by this much. + * Should be 0 for creates. + * @param deltaFileCount Changes the file count by this much. + * @param entityId Icat id of entity to update as a long. + * @param index Index (entity) to update. + * @throws IOException + */ + private void aggregateFileSize(long sizeToAdd, long sizeToSubtract, long deltaFileCount, long entityId, + String index) throws IOException { + long deltaFileSize = sizeToAdd - sizeToSubtract; + if (deltaFileSize != 0 || deltaFileCount != 0) { + IndexBucket indexBucket = indexBuckets.computeIfAbsent(index, k -> new IndexBucket(k)); + for (ShardBucket shardBucket : indexBucket.shardList) { + shardBucket.commit(); + IndexSearcher searcher = shardBucket.searcherManager.acquire(); + try { + Query idQuery = LongPoint.newExactQuery("id", entityId); + TopDocs topDocs = searcher.search(idQuery, 1); + if (topDocs.totalHits.value == 1) { + int docId = topDocs.scoreDocs[0].doc; + Document document = searcher.doc(docId); + Set prunedFields = new HashSet<>(); + List fieldsToAdd = new ArrayList<>(); + + incrementFileStatistic("fileSize", deltaFileSize, document, prunedFields, fieldsToAdd); + incrementFileStatistic("fileCount", deltaFileCount, document, prunedFields, fieldsToAdd); + + Document newDocument = pruneDocument(prunedFields, document); + fieldsToAdd.forEach(field -> newDocument.add(field)); + shardBucket.indexWriter.deleteDocuments(idQuery); + shardBucket.indexWriter.addDocument(facetsConfig.build(newDocument)); + shardBucket.commit(); + break; } + } finally { + shardBucket.searcherManager.release(searcher); } - search.query = maybeEmptyQuery(theQuery); } - - return luceneSearchResult("Datafile", search, maxResults, uid); - } catch (Exception e) { - logger.error("Error", e); - freeSearcher(uid); - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } } - @GET - @Produces(MediaType.APPLICATION_JSON) - @Path("datafiles/{uid}") - public String datafilesAfter(@PathParam("uid") long uid, @QueryParam("maxResults") int maxResults) - throws LuceneException { - try { - Search search = searches.get(uid); - try { - return luceneSearchResult("Datafile", search, maxResults, null); - } catch (Exception e) { - throw 
new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } - } catch (Exception e) { - freeSearcher(uid); - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + /** + * Increments a field relating to file statistics (count, size) as part of the + * update on a Document. + * + * @param statisticName Name of the field to increment, i.e. fileCount or + * fileSize. + * @param statisticDelta Change in the value of the named statistic. + * @param document Lucene Document containing the old statistic value to + * be incremented. + * @param prunedFields Set of fields which need to be removed from the old + * Document. If the statistic is incremented, this will + * have statisticName added to it. + * @param fieldsToAdd List of Lucene IndexableFields to add to the new + * Document. + */ + private void incrementFileStatistic(String statisticName, long statisticDelta, Document document, + Set prunedFields, List fieldsToAdd) { + if (statisticDelta != 0) { + prunedFields.add(statisticName); + long oldValue = document.getField(statisticName).numericValue().longValue(); + long newValue = oldValue + statisticDelta; + fieldsToAdd.add(new LongPoint(statisticName, newValue)); + fieldsToAdd.add(new StoredField(statisticName, newValue)); + fieldsToAdd.add(new NumericDocValuesField(statisticName, newValue)); } } + /** + * Creates a new Lucene document. + * + * @param entityName Name of the entity/index to create the document in. + * @param documentJson JsonObject representation of the document to be created. + * @throws NumberFormatException + * @throws IOException + * @throws LuceneException + */ + private void createNow(String entityName, JsonObject documentJson) + throws NumberFormatException, IOException, LuceneException { + Document document = parseDocument(documentJson); + logger.trace("create {} {}", entityName, document); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); + bucket.addDocument(facetsConfig.build(document)); + } + + /** + * Perform search on the Datafile entity/index. + * + * @param request Incoming Http request containing the query as Json. + * @param searchAfter String of Json representing the last Lucene Document from + * a previous search. + * @param maxResults The maximum number of results to include in the returned + * Json. + * @param sort String of Json representing the sort criteria. + * @return String of Json representing the results of the search. 
+ * @throws LuceneException + */ @POST @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) - @Path("datasets") - public String datasets(@Context HttpServletRequest request, @QueryParam("maxResults") int maxResults) - throws LuceneException { - - Long uid = null; - try { - uid = bucketNum.getAndIncrement(); - Search search = new Search(); - searches.put(uid, search); - Map map = new HashMap<>(); - search.map = map; - try (JsonReader r = Json.createReader(request.getInputStream())) { - JsonObject o = r.readObject(); - String userName = o.getString("user", null); - - BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); - - if (userName != null) { - - Query iuQuery = JoinUtil.createJoinQuery("investigation", false, "id", - new TermQuery(new Term("name", userName)), getSearcher(map, "InvestigationUser"), - ScoreMode.None); + @Path("datafile") + public String datafiles(@Context HttpServletRequest request, @QueryParam("search_after") String searchAfter, + @QueryParam("maxResults") int maxResults, @QueryParam("sort") String sort) throws LuceneException { + return searchEntity(request, searchAfter, maxResults, sort, SearchType.DATAFILE); + } - Query invQuery = JoinUtil.createJoinQuery("id", false, "investigation", iuQuery, - getSearcher(map, "Investigation"), ScoreMode.None); + /** + * Perform search on the Dataset entity/index. + * + * @param request Incoming Http request containing the query as Json. + * @param searchAfter String of Json representing the last Lucene Document from + * a previous search. + * @param maxResults The maximum number of results to include in the returned + * Json. + * @param sort String of Json representing the sort criteria. + * @return String of Json representing the results of the search. + * @throws LuceneException + */ + @POST + @Consumes(MediaType.APPLICATION_JSON) + @Produces(MediaType.APPLICATION_JSON) + @Path("dataset") + public String datasets(@Context HttpServletRequest request, @QueryParam("search_after") String searchAfter, + @QueryParam("maxResults") int maxResults, @QueryParam("sort") String sort) throws LuceneException { + return searchEntity(request, searchAfter, maxResults, sort, SearchType.DATASET); + } - theQuery.add(invQuery, Occur.MUST); + /** + * Deletes a Lucene document, provided that the target index is not locked for + * another operation. + * + * @param operationBody JsonObject containing the "_index" and the "_id" of the + * Document to be deleted. 
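A minimal sketch of the operation body that delete() below consumes; only "_index" and "_id" are read, and the id value here is made up.

import jakarta.json.Json;
import jakarta.json.JsonObject;

public class DeleteOperationExample {
    public static void main(String[] args) {
        // The entity name goes under "_index" and the ICAT id under "_id".
        JsonObject operationBody = Json.createObjectBuilder()
                .add("_index", "Datafile")
                .add("_id", 12345L)
                .build();
        System.out.println(operationBody);
    }
}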
+ * @throws LuceneException + * @throws IOException + */ + private void delete(JsonObject operationBody) throws LuceneException, IOException { + String entityName = operationBody.getString("_index"); + if (DocumentMapping.relationships.containsKey(entityName)) { + updateByRelation(operationBody, true); + } + if (DocumentMapping.indexedEntities.contains(entityName)) { + long icatId = operationBody.getJsonNumber("_id").longValueExact(); + try { + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); + if (bucket.locked.get()) { + throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, + "Lucene locked for " + entityName); } - - String text = o.getString("text", null); - if (text != null) { - theQuery.add(parser.parse(text, "text"), Occur.MUST); + logger.trace("delete {} {}", entityName, icatId); + Query idQuery = LongPoint.newExactQuery("id", icatId); + // Special case for filesizes + if (aggregateFiles && entityName.equals("Datafile")) { + for (ShardBucket shardBucket : bucket.shardList) { + IndexSearcher datafileSearcher = shardBucket.searcherManager.acquire(); + try { + TopDocs topDocs = datafileSearcher.search(idQuery, 1); + if (topDocs.totalHits.value == 1) { + int docId = topDocs.scoreDocs[0].doc; + Document datasetDocument = datafileSearcher.doc(docId); + long sizeToSubtract = datasetDocument.getField("fileSize").numericValue().longValue(); + if (sizeToSubtract > 0) { + long datasetId = datasetDocument.getField("dataset.id").numericValue().longValue(); + long investigationId = datasetDocument.getField("investigation.id").numericValue() + .longValue(); + aggregateFileSize(0, sizeToSubtract, -1, datasetId, "dataset"); + aggregateFileSize(0, sizeToSubtract, -1, investigationId, "investigation"); + } + break; + } + } finally { + shardBucket.searcherManager.release(datafileSearcher); + } + } } - - String lower = o.getString("lower", null); - String upper = o.getString("upper", null); - if (lower != null && upper != null) { - theQuery.add(new TermRangeQuery("startDate", new BytesRef(lower), new BytesRef(upper), true, true), - Occur.MUST); - theQuery.add(new TermRangeQuery("endDate", new BytesRef(lower), new BytesRef(upper), true, true), - Occur.MUST); + for (ShardBucket shardBucket : bucket.shardList) { + shardBucket.indexWriter.deleteDocuments(idQuery); } + } catch (IOException e) { + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + } + } + } - if (o.containsKey("params")) { - JsonArray params = o.getJsonArray("params"); - IndexSearcher datasetParameterSearcher = getSearcher(map, "DatasetParameter"); - for (JsonValue p : params) { - BooleanQuery.Builder paramQuery = parseParameter(p); - Query toQuery = JoinUtil.createJoinQuery("dataset", false, "id", paramQuery.build(), - datasetParameterSearcher, ScoreMode.None); - theQuery.add(toQuery, Occur.MUST); - } + /** + * Encodes core Lucene information (keys preceded by underscores) and a + * selection of the Document's source fields to JSON to be returned to + * icat.server. Note that "_id" is the Lucene Document id, and should not be + * confused with the ICAT entity id, which should be denoted by the key "id" + * within the "_source" object. + * + * @param gen JsonGenerator to encode the information to. + * @param hit ScoreDoc representing a single search result. + * @param searcher IndexSearcher used to get the Document for the hit. + * @param search Search object containing the fields to return. 
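For orientation, a sketch of one entry of the "results" array as encodeResult() writes it; every value is invented. Note again that "_id" is the shard-local Lucene document id while the ICAT entity id travels as "id" inside "_source".

import jakarta.json.Json;
import jakarta.json.JsonObject;

public class EncodedResultExample {
    public static void main(String[] args) {
        JsonObject hit = Json.createObjectBuilder()
                .add("_id", 4071)          // Lucene document id within the shard
                .add("_shardIndex", 0)
                .add("_score", 7.3)        // only present for scored searches
                .add("_source", Json.createObjectBuilder()
                        .add("id", 9001L)              // ICAT entity id
                        .add("name", "run_00123.nxs")) // only fields requested via search.fields
                .build();
        // When joined fields are requested, "_source" also carries arrays of joined documents.
        System.out.println(hit);
    }
}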
+ * @throws IOException + * @throws LuceneException + */ + private void encodeResult(String entityName, JsonGenerator gen, ScoreDoc hit, IndexSearcher searcher, + SearchBucket search) + throws IOException, LuceneException { + int luceneDocId = hit.doc; + int shardIndex = hit.shardIndex; + Document document = searcher.doc(luceneDocId); + gen.writeStartObject().write("_id", luceneDocId).write("_shardIndex", shardIndex); + Float score = hit.score; + if (!score.equals(Float.NaN)) { + gen.write("_score", hit.score); + } + gen.writeStartObject("_source"); + document.forEach(encodeField(gen, search.fields)); + for (String joinedEntityName : search.joinedFields.keySet()) { + List searchers = getSearchers(search.searcherMap, joinedEntityName); + List shards = getShards(joinedEntityName); + SearchBucket joinedSearch = new SearchBucket(this); + String fld; + long parentId; + if (joinedEntityName.toLowerCase().contains("investigation")) { + fld = "investigation.id"; + if (entityName.equalsIgnoreCase("investigation")) { + parentId = document.getField("id").numericValue().longValue(); + } else { + parentId = document.getField("investigation.id").numericValue().longValue(); } - search.query = maybeEmptyQuery(theQuery); + } else { + fld = entityName.toLowerCase() + ".id"; + parentId = document.getField("id").numericValue().longValue(); } - return luceneSearchResult("Dataset", search, maxResults, uid); - } catch (Exception e) { - logger.error("Error", e); - freeSearcher(uid); - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + joinedSearch.query = LongPoint.newExactQuery(fld, parentId); + joinedSearch.sort = new Sort(new SortedNumericSortField("id", Type.LONG)); + TopFieldDocs topFieldDocs = searchShards(joinedSearch, 100, shards); + gen.writeStartArray(joinedEntityName.toLowerCase()); + for (ScoreDoc joinedHit : topFieldDocs.scoreDocs) { + gen.writeStartObject(); + Document joinedDocument = searchers.get(joinedHit.shardIndex).doc(joinedHit.doc); + joinedDocument.forEach(encodeField(gen, search.joinedFields.get(joinedEntityName))); + gen.writeEnd(); + } + gen.writeEnd(); } - + gen.writeEnd().writeEnd(); // source object, result object } - @GET - @Produces(MediaType.APPLICATION_JSON) - @Path("datasets/{uid}") - public String datasetsAfter(@PathParam("uid") long uid, @QueryParam("maxResults") int maxResults) - throws LuceneException { - try { - Search search = searches.get(uid); - try { - return luceneSearchResult("Dataset", search, maxResults, null); - } catch (Exception e) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + private Consumer encodeField(JsonGenerator gen, Set fields) { + return (field) -> { + String fieldName = field.name(); + if (fields.contains(fieldName)) { + if (DocumentMapping.longFields.contains(fieldName)) { + gen.write(fieldName, field.numericValue().longValue()); + } else if (DocumentMapping.doubleFields.contains(fieldName)) { + gen.write(fieldName, field.numericValue().doubleValue()); + } else { + gen.write(fieldName, field.stringValue()); + } } - } catch (Exception e) { - freeSearcher(uid); - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } + }; } @PreDestroy @@ -571,12 +838,8 @@ private void exit() { timer = null; // This seems to be necessary to make it really stop } try { - for (Entry entry : indexBuckets.entrySet()) { - IndexBucket bucket = entry.getValue(); - bucket.searcherManager.close(); - bucket.indexWriter.commit(); - bucket.indexWriter.close(); - 
bucket.directory.close(); + for (IndexBucket bucket : indexBuckets.values()) { + bucket.close(); } logger.info("Closed down icat.lucene"); } catch (Exception e) { @@ -584,43 +847,123 @@ private void exit() { } } - @DELETE - @Path("freeSearcher/{uid}") - public void freeSearcher(@PathParam("uid") Long uid) throws LuceneException { - if (uid != null) { // May not be set for internal calls - logger.debug("Requesting freeSearcher {}", uid); - Map search = searches.get(uid).map; - for (Entry entry : search.entrySet()) { + /** + * Perform faceting on an entity/index. The query associated with the request + * should determine which Documents to consider, and optionally the dimensions + * to facet. If no dimensions are provided, "sparse" faceting is performed + * across relevant string fields (but no Range faceting occurs). + * + * @param entityName Name of the entity/index to facet on. + * @param request Incoming Http request containing the query as Json. + * @param searchAfter String of Json representing the last Lucene Document from + * a previous search. + * @param maxResults The maximum number of results to include in the returned + * Json. + * @param maxLabels The maximum number of labels to return for each dimension + * of the facets. + * @param sort String of Json representing the sort criteria. + * @return String of Json representing the results of the faceting. + * @throws LuceneException + */ + @POST + @Consumes(MediaType.APPLICATION_JSON) + @Produces(MediaType.APPLICATION_JSON) + @Path("{entityName}/facet") + public String facet(@PathParam("entityName") String entityName, @Context HttpServletRequest request, + @QueryParam("search_after") String searchAfter, @QueryParam("maxResults") int maxResults, + @QueryParam("maxLabels") int maxLabels, @QueryParam("sort") String sort) throws LuceneException { + SearchBucket search = null; + try { + search = new SearchBucket(this, SearchType.GENERIC, request, sort, null); + return luceneFacetResult(entityName, search, searchAfter, maxResults, maxLabels); + } catch (IOException | QueryNodeException e) { + logger.error("Error", e); + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + } finally { + freeSearcher(search); + } + } + + /** + * Releases all IndexSearchers associated with a SearchBucket. + * + * @param search SearchBucket to be freed. + * @throws LuceneException + */ + public void freeSearcher(SearchBucket search) throws LuceneException { + if (search != null) { + for (Entry> entry : search.searcherMap.entrySet()) { String name = entry.getKey(); - IndexSearcher isearcher = entry.getValue(); - SearcherManager manager = indexBuckets.computeIfAbsent(name, k -> createBucket(k)).searcherManager; + List subReaders = entry.getValue(); try { - manager.release(isearcher); + indexBuckets.computeIfAbsent(name.toLowerCase(), k -> new IndexBucket(k)) + .releaseSearchers(subReaders); } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } } - searches.remove(uid); } } - /* - * Need a new set of IndexSearchers for each search as identified by a uid + /** + * Gets all IndexSearchers needed for the shards of a given entity/index. + * + * @param searcherMap Map of entity names to their IndexSearchers. + * @param name Name of the entity to get the IndexSearchers for. + * @return List of IndexSearchers for name. 
+ * @throws IOException + */ + private List getSearchers(Map> searcherMap, String name) + throws IOException { + String nameLowercase = name.toLowerCase(); + logger.trace("Get searchers for {}", nameLowercase); + List subSearchers = searcherMap.get(nameLowercase); + if (subSearchers == null) { + logger.trace("No searchers found for {}", nameLowercase); + subSearchers = indexBuckets.computeIfAbsent(nameLowercase, k -> new IndexBucket(k)).acquireSearchers(); + searcherMap.put(nameLowercase, subSearchers); + logger.debug("Remember searcher for {}", nameLowercase); + } + return subSearchers; + } + + /** + * Gets a single IndexSearcher for name. When multiple shards are possible, + * getSearchers should be used instead. + * + * @param searcherMap Map of entity names to their IndexSearchers. + * @param name Name of the entity to get the IndexSearcher for. + * @return The IndexSearcher for name. + * @throws IOException + * @throws LuceneException If there are more than one shard for name. */ - private IndexSearcher getSearcher(Map bucket, String name) throws IOException { - IndexSearcher isearcher = bucket.get(name); - if (isearcher == null) { - isearcher = indexBuckets.computeIfAbsent(name, k -> createBucket(k)).searcherManager.acquire(); - bucket.put(name, isearcher); - logger.debug("Remember searcher for {}", name); + public IndexSearcher getSearcher(Map> searcherMap, String name) + throws IOException, LuceneException { + List subSearchers = searcherMap.get(name); + subSearchers = getSearchers(searcherMap, name); + if (subSearchers.size() != 1) { + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, + "Cannot get single IndexSearcher for " + name + " as it has " + subSearchers.size() + " shards"); } - return isearcher; + return subSearchers.get(0); + } + + /** + * Gets all ShardBuckets of a given entity/index. + * + * @param name Name of the entity to get the ShardBuckets for. + * @return List of ShardBuckets for name. + */ + private List getShards(String name) { + return indexBuckets.computeIfAbsent(name.toLowerCase(), k -> new IndexBucket(k)).shardList; } @PostConstruct private void init() { logger.info("Initialising icat.lucene"); CheckedProperties props = new CheckedProperties(); + String unitsString; + int commitSeconds; try { props.loadFromResource("run.properties"); @@ -629,24 +972,39 @@ private void init() { throw new Exception(luceneDirectory + " is not a directory"); } - luceneCommitMillis = props.getPositiveInt("commitSeconds") * 1000; - - analyzer = new IcatAnalyzer(); + commitSeconds = props.getPositiveInt("commitSeconds"); + luceneCommitMillis = commitSeconds * 1000; + luceneMaxShardSize = Math.max(props.getPositiveLong("maxShardSize"), Long.valueOf(Integer.MAX_VALUE + 1)); + maxSearchTimeSeconds = props.has("maxSearchTimeSeconds") ? 
props.getPositiveLong("maxSearchTimeSeconds") + : 5; + aggregateFiles = props.getBoolean("aggregateFiles", false); - parser = new StandardQueryParser(); - StandardQueryConfigHandler qpConf = (StandardQueryConfigHandler) parser.getQueryConfigHandler(); - qpConf.set(ConfigurationKeys.ANALYZER, analyzer); - qpConf.set(ConfigurationKeys.ALLOW_LEADING_WILDCARD, true); + initTimer(); - timer = new Timer("LuceneCommitTimer"); - timer.schedule(new CommitTimerTask(), luceneCommitMillis, luceneCommitMillis); + unitsString = props.getString("units", ""); + icatUnits = new IcatUnits(unitsString); + String facetFieldsString = props.getString("facetFields", ""); + for (String facetField : facetFieldsString.split("\\s+")) { + facetFields.add(facetField); + } } catch (Exception e) { logger.error(fatal, e.getMessage()); throw new IllegalStateException(e.getMessage()); } - logger.info("Initialised icat.lucene"); + String format = "Initialised icat.lucene with directory {}, commitSeconds {}, maxShardSize {}, " + + "maxSearchTimeSeconds {}, aggregateFiles {}, units {}, facetFields {}"; + logger.info(format, luceneDirectory, commitSeconds, luceneMaxShardSize, maxSearchTimeSeconds, + aggregateFiles, unitsString, facetFields); + } + + /** + * Starts a timer and schedules regular commits of the IndexWriter. + */ + private void initTimer() { + timer = new Timer("LuceneCommitTimer"); + timer.schedule(new CommitTimerTask(), luceneCommitMillis, luceneCommitMillis); } class CommitTimerTask extends TimerTask { @@ -660,220 +1018,762 @@ public void run() { } } + /** + * Perform search on the Investigation entity/index. + * + * @param request Incoming Http request containing the query as Json. + * @param searchAfter String of Json representing the last Lucene Document from + * a previous search. + * @param maxResults The maximum number of results to include in the returned + * Json. + * @param sort String of Json representing the sort criteria. + * @return String of Json representing the results of the search. 
+ * @throws LuceneException + */ @POST @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) - @Path("investigations") - public String investigations(@Context HttpServletRequest request, @QueryParam("maxResults") int maxResults) - throws LuceneException { - Long uid = null; - try { - uid = bucketNum.getAndIncrement(); - Search search = new Search(); - searches.put(uid, search); - Map map = new HashMap<>(); - search.map = map; - try (JsonReader r = Json.createReader(request.getInputStream())) { - JsonObject o = r.readObject(); - String userName = o.getString("user", null); - - BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); - - if (userName != null) { - Query iuQuery = JoinUtil.createJoinQuery("investigation", false, "id", - new TermQuery(new Term("name", userName)), getSearcher(map, "InvestigationUser"), - ScoreMode.None); - theQuery.add(iuQuery, Occur.MUST); - } + @Path("investigation") + public String investigations(@Context HttpServletRequest request, @QueryParam("search_after") String searchAfter, + @QueryParam("maxResults") int maxResults, @QueryParam("sort") String sort) throws LuceneException { + return searchEntity(request, searchAfter, maxResults, sort, SearchType.INVESTIGATION); + } - String text = o.getString("text", null); - if (text != null) { - theQuery.add(parser.parse(text, "text"), Occur.MUST); - } + /** + * Locks the specified index for population, optionally removing all existing + * documents and preventing normal modify operations until the index is + * unlocked. + * + * A check is also performed against the minId and maxId used for population. + * This ensures that no data is duplicated in the index. + * + * @param entityName Name of the entity/index to lock. + * @param minId The exclusive minimum ICAT id being populated for. If + * Documents already exist with an id greater than this, the + * lock will fail. If null, treated as if it were + * Long.MIN_VALUE + * @param maxId The inclusive maximum ICAT id being populated for. If + * Documents already exist with an id less than or equal to + * this, the lock will fail. If null, treated as if it were + * Long.MAX_VALUE + * @param delete Whether to delete all existing Documents on the index. + * @throws LuceneException If already locked, if there's an IOException when + * deleting documents, or if the min/max id values are + * provided and Documents already exist in that range. 
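A hedged usage sketch of the lock/unlock cycle described above, using the JDK HttpClient. The deployment base URL is hypothetical; the paths and query parameters are the ones declared on this class.

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class LockUnlockExample {
    public static void main(String[] args) throws Exception {
        HttpClient client = HttpClient.newHttpClient();
        String base = "https://localhost:8181/icat.lucene"; // hypothetical deployment URL

        // Lock the Dataset index before populating ids in the range (0, 1000000];
        // this fails with 400 if a document in that range is already present.
        HttpRequest lock = HttpRequest.newBuilder(
                URI.create(base + "/lock/Dataset?minId=0&maxId=1000000&delete=false"))
                .POST(HttpRequest.BodyPublishers.noBody())
                .build();
        System.out.println(client.send(lock, HttpResponse.BodyHandlers.ofString()).statusCode());

        // ... index the documents for that range (not shown) ...

        // Unlock once population is complete; this commits any pending documents.
        HttpRequest unlock = HttpRequest.newBuilder(URI.create(base + "/unlock/Dataset"))
                .POST(HttpRequest.BodyPublishers.noBody())
                .build();
        System.out.println(client.send(unlock, HttpResponse.BodyHandlers.ofString()).statusCode());
    }
}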
+ */ + @POST + @Path("lock/{entityName}") + public void lock(@PathParam("entityName") String entityName, @QueryParam("minId") Long minId, + @QueryParam("maxId") Long maxId, @QueryParam("delete") boolean delete) throws LuceneException { + try { + logger.info("Requesting lock of {} index, minId={}, maxId={}, delete={}", entityName, minId, maxId, delete); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); - String lower = o.getString("lower", null); - String upper = o.getString("upper", null); - if (lower != null && upper != null) { - theQuery.add(new TermRangeQuery("startDate", new BytesRef(lower), new BytesRef(upper), true, true), - Occur.MUST); - theQuery.add(new TermRangeQuery("endDate", new BytesRef(lower), new BytesRef(upper), true, true), - Occur.MUST); + if (!bucket.locked.compareAndSet(false, true)) { + String message = "Lucene already locked for " + entityName; + throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, message); + } + if (delete) { + for (ShardBucket shardBucket : bucket.shardList) { + shardBucket.indexWriter.deleteAll(); } + // Reset the shardList so we reset the routing + ShardBucket shardBucket = bucket.shardList.get(0); + bucket.shardList = new ArrayList<>(); + bucket.shardList.add(shardBucket); + return; + } - if (o.containsKey("params")) { - JsonArray params = o.getJsonArray("params"); - IndexSearcher investigationParameterSearcher = getSearcher(map, "InvestigationParameter"); - - for (JsonValue p : params) { - BooleanQuery.Builder paramQuery = parseParameter(p); - Query toQuery = JoinUtil.createJoinQuery("investigation", false, "id", paramQuery.build(), - investigationParameterSearcher, ScoreMode.None); - theQuery.add(toQuery, Occur.MUST); + for (ShardBucket shardBucket : bucket.shardList) { + IndexSearcher searcher = shardBucket.searcherManager.acquire(); + try { + Query query; + if (minId == null && maxId == null) { + query = new MatchAllDocsQuery(); + } else { + if (minId == null) { + minId = Long.MIN_VALUE; + } + if (maxId == null) { + maxId = Long.MAX_VALUE; + } + query = LongPoint.newRangeQuery("id", minId + 1, maxId); + } + TopDocs topDoc = searcher.search(query, 1); + if (topDoc.scoreDocs.length != 0) { + // If we have any results in the populating range, unlock and throw + bucket.locked.compareAndSet(true, false); + Document doc = searcher.doc(topDoc.scoreDocs[0].doc); + long id = doc.getField("id").numericValue().longValue(); + String message = "While locking index, found id " + id + " in specified range"; + logger.error(message); + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, message); } + } finally { + shardBucket.searcherManager.release(searcher); } + } + } catch (IOException e) { + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + } + } - if (o.containsKey("samples")) { - JsonArray samples = o.getJsonArray("samples"); - IndexSearcher sampleSearcher = getSearcher(map, "Sample"); - - for (JsonValue s : samples) { - JsonString sample = (JsonString) s; - BooleanQuery.Builder sampleQuery = new BooleanQuery.Builder(); - sampleQuery.add(parser.parse(sample.getString(), "text"), Occur.MUST); - Query toQuery = JoinUtil.createJoinQuery("investigation", false, "id", sampleQuery.build(), - sampleSearcher, ScoreMode.None); - theQuery.add(toQuery, Occur.MUST); + /** + * Perform faceting on an entity/index. + * + * @param name Entity/index to facet. + * @param search SearchBucket containing the search query, dimensions to + * facet etc. 
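A possible request for the facet endpoint, shown as a sketch only: the outer "query" key is read by SearchBucket, but the content of "query" and the exact shape of the "dimensions" list are assumptions based on how FacetedDimension is used here, since that parsing code is outside this excerpt. The response is the {"aggregations": ...} object assembled at the end of luceneFacetResult() below.

import jakarta.json.Json;
import jakarta.json.JsonObject;

public class FacetRequestExample {
    public static void main(String[] args) {
        // Possible body for POST /Datafile/facet?maxResults=1000&maxLabels=10.
        JsonObject body = Json.createObjectBuilder()
                .add("query", Json.createObjectBuilder()
                        .add("text", "calibration")) // illustrative only
                .add("dimensions", Json.createArrayBuilder()
                        .add(Json.createObjectBuilder()
                                .add("dimension", "datafileFormat.name"))) // assumed key name
                .build();
        System.out.println(body);
    }
}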
+ * @param searchAfter String of Json representing the last Lucene Document from + * a previous search. + * @param maxResults The maximum number of results from the search. + * @param maxLabels The maximum number of labels to return for each dimension + * of the facets. + * @return String of Json representing the facets of the search results. + * @throws IOException + * @throws IllegalStateException If the IndexSearcher and its DirectoryReader + * are not in sync. + * @throws LuceneException If ranges are provided for a non-numeric field, + * or something else goes wrong. + */ + private String luceneFacetResult(String name, SearchBucket search, String searchAfter, int maxResults, + int maxLabels) throws IOException, IllegalStateException, LuceneException { + // If no dimensions were specified, perform "sparse" faceting on all applicable + // string values + boolean sparse = search.dimensions.size() == 0; + // By default, assume we do not need to perform string based faceting for + // specific dimensions + boolean facetStrings = false; + if (maxResults <= 0 || maxLabels <= 0) { + // This will result in no Facets and a null pointer, so return early + logger.warn("Cannot facet when maxResults={}, maxLabels={}, returning empty list", maxResults, maxLabels); + } else { + // Iterate over shards and aggregate the facets from each + logger.debug("Faceting {} with {} after {} ", name, search.query, searchAfter); + List shards = getShards(name); + for (ShardBucket shard : shards) { + FacetsCollector facetsCollector = new FacetsCollector(); + IndexSearcher indexSearcher = shard.searcherManager.acquire(); + try { + TopDocs results = FacetsCollector.search(indexSearcher, search.query, maxResults, facetsCollector); + logger.debug("{}", results.totalHits); + for (FacetedDimension facetedDimension : search.dimensions.values()) { + facetStrings = facetRanges(maxLabels, facetStrings, facetsCollector, facetedDimension); + } + if (shard.state == null) { + logger.debug("State not set, this is most likely due to not having any facetable fields"); + continue; + } else if (shard.state.reader != indexSearcher.getIndexReader()) { + logger.warn("Attempted search with outdated state, create new one from current IndexReader"); + shard.state = new DefaultSortedSetDocValuesReaderState(indexSearcher.getIndexReader()); } + facetStrings(search, maxLabels, sparse, facetStrings, indexSearcher, facetsCollector, shard.state); + } finally { + shard.searcherManager.release(indexSearcher); } + } + } + // Build results + JsonObjectBuilder aggregationsBuilder = Json.createObjectBuilder(); + search.dimensions.values().forEach(facetedDimension -> facetedDimension.buildResponse(aggregationsBuilder)); + String aggregations = Json.createObjectBuilder().add("aggregations", aggregationsBuilder).build().toString(); + logger.debug("aggregations: {}", aggregations); + return aggregations; + } - String userFullName = o.getString("userFullName", null); - if (userFullName != null) { - BooleanQuery.Builder userFullNameQuery = new BooleanQuery.Builder(); - userFullNameQuery.add(parser.parse(userFullName, "text"), Occur.MUST); - IndexSearcher investigationUserSearcher = getSearcher(map, "InvestigationUser"); - Query toQuery = JoinUtil.createJoinQuery("investigation", false, "id", userFullNameQuery.build(), - investigationUserSearcher, ScoreMode.None); - theQuery.add(toQuery, Occur.MUST); - } + /** + * Performs range based faceting on the provided facetedDimension, if possible. 
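A self-contained sketch of the plain Lucene 8.11 calls that the range branch of facetRanges() below relies on. The "fileSize" field name and the bucket boundaries are invented, and the field is assumed to have been indexed with both a LongPoint and a NumericDocValuesField, as the indexing code in this class does.

import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.range.LongRange;
import org.apache.lucene.facet.range.LongRangeFacetCounts;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.store.Directory;

public class RangeFacetSketch {
    // Counts documents whose long "fileSize" field falls into each labelled bucket.
    static FacetResult fileSizeBuckets(Directory dir) throws Exception {
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            FacetsCollector collector = new FacetsCollector();
            FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, collector);
            LongRange[] ranges = {
                    new LongRange("small", 0, true, 1_000_000, false),
                    new LongRange("large", 1_000_000, true, Long.MAX_VALUE, true) };
            Facets facets = new LongRangeFacetCounts("fileSize", collector, ranges);
            return facets.getTopChildren(10, "fileSize");
        }
    }
}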
+ * + * @param maxLabels The maximum number of labels to collect for each + * facet + * @param facetStrings Whether there are String dimensions that will need + * faceting later + * @param facetsCollector Lucene FacetsCollector used to count results + * @param facetedDimension Representation of the dimension to facet, and used to + * store the results of the faceting + * @return If a string dimension was encountered, returns true. Otherwise, + * returns the value of facetStrings originally passed. + * @throws IOException + * @throws LuceneException + */ + private boolean facetRanges(int maxLabels, boolean facetStrings, FacetsCollector facetsCollector, + FacetedDimension facetedDimension) throws IOException, LuceneException { + if (facetedDimension.getRanges().size() > 0) { + logger.debug("Ranges: {}", facetedDimension.getRanges().get(0).getClass().getSimpleName()); + // Perform range based facets for a numeric field + String dimension = facetedDimension.getDimension(); + Facets facets; + if (DocumentMapping.longFields.contains(dimension)) { + LongRange[] ranges = facetedDimension.getRanges().toArray(new LongRange[0]); + facets = new LongRangeFacetCounts(dimension, facetsCollector, ranges); + } else if (DocumentMapping.doubleFields.contains(dimension)) { + DoubleRange[] ranges = facetedDimension.getRanges().toArray(new DoubleRange[0]); + facets = new DoubleRangeFacetCounts(dimension, facetsCollector, ranges); + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "'ranges' specified for dimension " + dimension + + " but this is not a supported numeric field"); + } + FacetResult facetResult = facets.getTopChildren(maxLabels, dimension); + facetedDimension.addResult(facetResult); + } else { + // Have a specific string dimension to facet, but these should all be done at + // once for efficiency + facetStrings = true; + } + return facetStrings; + } - search.query = maybeEmptyQuery(theQuery); + /** + * Performs String based faceting. Either this will be sparse (all fields + * targeted) or it will occur for specific fields only. 
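A self-contained sketch of the sorted-set doc values faceting that the string branch builds on: at index time the facetable value goes in as a SortedSetDocValuesFacetField via FacetsConfig.build(), at query time a reader state plus a FacetsCollector yields the counts. The ".keyword" suffix is an assumption inferred from the dim.replace(".keyword", "") calls here; the actual field naming is handled by the Field helper class, which is not part of this excerpt.

import org.apache.lucene.document.Document;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;

public class StringFacetSketch {
    static final FacetsConfig CONFIG = new FacetsConfig();

    // Index time: store the label so it can be counted later.
    static void index(IndexWriter writer, String formatName) throws Exception {
        Document doc = new Document();
        doc.add(new SortedSetDocValuesFacetField("datafileFormat.name.keyword", formatName));
        writer.addDocument(CONFIG.build(doc));
    }

    // Query time: count labels for the matching documents.
    static FacetResult count(DirectoryReader reader) throws Exception {
        IndexSearcher searcher = new IndexSearcher(reader);
        FacetsCollector collector = new FacetsCollector();
        FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, collector);
        DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(reader);
        Facets facets = new SortedSetDocValuesFacetCounts(state, collector);
        return facets.getTopChildren(10, "datafileFormat.name.keyword");
    }
}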
+ * + * @param search Bucket being used for this search + * @param maxLabels The maximum number of labels to collect for each facet + * @param sparse Whether to perform sparse faceting (faceting across + * all String fields) + * @param facetStrings Whether specific String dimensions should be faceted + * @param indexSearcher Lucene IndexSearcher used to generate the ReaderState + * @param facetsCollector Lucene FacetsCollector used to count results + * @param state Lucene State used to count results + * @throws IOException + */ + private void facetStrings(SearchBucket search, int maxLabels, boolean sparse, boolean facetStrings, + IndexSearcher indexSearcher, FacetsCollector facetsCollector, DefaultSortedSetDocValuesReaderState state) + throws IOException { + try { + logger.trace("String faceting"); + Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); + if (sparse) { + // Facet all applicable string fields + addFacetResults(maxLabels, search.dimensions, facets); + logger.trace("Sparse string faceting found results for {} dimensions", search.dimensions.size()); + } else if (facetStrings) { + // Only add facets to the results if they match one of the requested dimensions + List facetResults = facets.getAllDims(maxLabels); + for (FacetResult facetResult : facetResults) { + String dimension = facetResult.dim.replace(".keyword", ""); + FacetedDimension facetedDimension = search.dimensions.get(dimension); + logger.trace("String facets found for {}, requested dimensions were {}", dimension, + search.dimensions.keySet()); + if (facetedDimension != null) { + facetedDimension.addResult(facetResult); + } + } } - logger.info("Query: {}", search.query); - return luceneSearchResult("Investigation", search, maxResults, uid); - } catch (Exception e) { - logger.error("Error", e); - freeSearcher(uid); - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + } catch (IllegalArgumentException e) { + // This can occur if no fields in the index have been faceted + logger.error( + "No facets found in index, resulting in error: " + e.getClass() + " " + e.getMessage()); + } catch (IllegalStateException e) { + // This can occur if we do not create the IndexSearcher from the same + // DirectoryReader as we used to create the state + logger.error("IndexSearcher used is not based on the DirectoryReader used for facet counting: " + + e.getClass() + " " + e.getMessage()); + throw e; } + } + /** + * Add Facets for all dimensions. This will create FacetDimension Objects if the + * do not already exist in the facetedDimensionMap, otherwise the counts for + * each label will be aggregated. + * + * @param maxLabels The maximum number of labels for a given + * dimension. This labels with the highest counts are + * returned first. + * @param facetedDimensionMap Map containing the dimensions that have been or + * should be faceted. + * @param facets Lucene facets object containing all dimensions. 
+ * @throws IOException + */ + private void addFacetResults(int maxLabels, Map facetedDimensionMap, Facets facets) + throws IOException { + for (FacetResult facetResult : facets.getAllDims(maxLabels)) { + String dim = facetResult.dim.replace(".keyword", ""); + logger.trace("Sparse faceting: FacetResult for {}", dim); + FacetedDimension facetedDimension = facetedDimensionMap.get(dim); + if (facetedDimension == null) { + facetedDimension = new FacetedDimension(dim); + facetedDimensionMap.put(dim, facetedDimension); + } + facetedDimension.addResult(facetResult); + } } - @GET - @Produces(MediaType.APPLICATION_JSON) - @Path("investigations/{uid}") - public String investigationsAfter(@PathParam("uid") long uid, @QueryParam("maxResults") int maxResults) - throws LuceneException { + /** + * Perform search on the specified entity/index. + * + * @param request Incoming Http request containing the query as Json. + * @param searchAfter String of Json representing the last Lucene Document from + * a previous search. + * @param maxResults The maximum number of results to include in the returned + * Json. + * @param sort String of Json representing the sort criteria. + * @param searchType The type of search query to build, corresponding to one of + * the main entities. + * @return String of Json representing the results of the search. + * @throws LuceneException + */ + private String searchEntity(HttpServletRequest request, String searchAfter, int maxResults, String sort, + SearchType searchType) throws LuceneException { + SearchBucket search = null; try { - Search search = searches.get(uid); - try { - return luceneSearchResult("Investigation", search, maxResults, null); - } catch (Exception e) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } - } catch (Exception e) { - freeSearcher(uid); + search = new SearchBucket(this, searchType, request, sort, searchAfter); + return luceneSearchResult(searchType.toString(), search, searchAfter, maxResults); + } catch (IOException | QueryNodeException e) { + logger.error("Error", e); throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + } finally { + freeSearcher(search); } } - @POST - @Path("lock/{entityName}") - public void lock(@PathParam("entityName") String entityName) throws LuceneException { - logger.info("Requesting lock of {} index", entityName); - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> createBucket(k)); - - if (!bucket.locked.compareAndSet(false, true)) { - throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene already locked for " + entityName); + /** + * Perform search on name. + * + * @param name Entity/index to search. + * @param search SearchBucket containing the search query, dimensions to + * facet etc. + * @param searchAfter String of Json representing the last Lucene Document from + * a previous search. + * @param maxResults The maximum number of results from the search. + * @return String of Json representing the results of the search. 
+ * @throws IOException + * @throws LuceneException + */ + private String luceneSearchResult(String name, SearchBucket search, String searchAfter, int maxResults) + throws IOException, LuceneException { + List searchers = getSearchers(search.searcherMap, name); + List shards = getShards(name); + String format = "Search {} with: query {}, maxResults {}, searchAfter {}, scored {}, fields {}"; + logger.debug(format, name, search.query, maxResults, searchAfter, search.scored, search.fields); + TopFieldDocs topFieldDocs = searchShards(search, maxResults, shards); + ScoreDoc[] hits = topFieldDocs.scoreDocs; + TotalHits totalHits = topFieldDocs.totalHits; + SortField[] fields = topFieldDocs.fields; + Float maxScore = Float.NaN; + if (hits.length > 0) { + maxScore = hits[0].score; } + logger.debug("{} maxscore {}", totalHits, maxScore); + return encodeResults(name, search, maxResults, searchers, hits, fields); + } + + /** + * Performs a search by iterating over all relevant shards. + * + * @param search SearchBucket containing the search query, dimensions to + * facet etc. + * @param maxResults The maximum number of results from the search. + * @param shards List of all ShardBuckets for the entity to be searched. + * @return Lucene TopFieldDocs resulting from the search. + * @throws IOException + * @throws LuceneException If the search runs for longer than the allowed time + */ + private TopFieldDocs searchShards(SearchBucket search, int maxResults, List shards) + throws IOException, LuceneException { + + TopFieldDocs topFieldDocs; + Counter clock = TimeLimitingCollector.getGlobalCounter(); + TimeLimitingCollector collector = new TimeLimitingCollector(null, clock, maxSearchTimeSeconds * 1000); + try { - bucket.indexWriter.deleteAll(); - } catch (IOException e) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + List shardHits = new ArrayList<>(); + int doc = search.searchAfter != null ? 
search.searchAfter.doc : -1; + for (ShardBucket shard : shards) { + // Handle the possibility of some shards having a higher docCount than the doc + // id on searchAfter + int docCount = shard.documentCount.intValue(); + if (search.searchAfter != null) { + if (doc > docCount) { + search.searchAfter.doc = docCount - 1; + } else { + search.searchAfter.doc = doc; + } + } + + // Wrap Collector with TimeLimitingCollector + TopFieldCollector topFieldCollector = TopFieldCollector.create(search.sort, maxResults, + search.searchAfter, maxResults); + collector.setCollector(topFieldCollector); + + IndexSearcher indexSearcher = shard.searcherManager.acquire(); + try { + indexSearcher.search(search.query, collector); + TopFieldDocs topDocs = topFieldCollector.topDocs(); + if (search.scored) { + TopFieldCollector.populateScores(topDocs.scoreDocs, indexSearcher, search.query); + } + shardHits.add(topDocs); + } finally { + shard.searcherManager.release(indexSearcher); + } + } + topFieldDocs = TopFieldDocs.merge(search.sort, 0, maxResults, shardHits.toArray(new TopFieldDocs[0]), + true); + + return topFieldDocs; + + } catch (TimeExceededException e) { + String message = "Search cancelled for exceeding " + maxSearchTimeSeconds + " seconds"; + throw new LuceneException(HttpURLConnection.HTTP_GATEWAY_TIMEOUT, message); } } - private String luceneSearchResult(String name, Search search, int maxResults, Long uid) throws IOException { - IndexSearcher isearcher = getSearcher(search.map, name); - logger.debug("To search in {} for {} {} with {} from {} ", name, search.query, maxResults, isearcher, - search.lastDoc); - TopDocs topDocs = search.lastDoc == null ? isearcher.search(search.query, maxResults) - : isearcher.searchAfter(search.lastDoc, search.query, maxResults); - ScoreDoc[] hits = topDocs.scoreDocs; - logger.debug("Hits " + topDocs.totalHits + " maxscore " + topDocs.getMaxScore()); + /** + * Encodes the results of a search into Json. + * + * @param name Entity/index that has been searched search + * @param search SearchBucket containing the search query, dimensions to + * facet etc. 
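A minimal restatement of the clamping rule that searchShards() above applies to search_after document ids, with invented numbers; Lucene doc ids are only meaningful within the shard that produced them, so they are capped at the current shard's document count.

public class SearchAfterClampSketch {
    static int clamp(int searchAfterDoc, int shardDocCount) {
        // Mirrors the branch in searchShards(): never point past the end of this shard.
        return searchAfterDoc > shardDocCount ? shardDocCount - 1 : searchAfterDoc;
    }

    public static void main(String[] args) {
        // The previous page ended at doc 8000 of a large shard, but this shard holds 500 docs.
        System.out.println(clamp(8000, 500)); // 499
        System.out.println(clamp(120, 500));  // 120
    }
}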
+ * @param maxResults The maximum number of results from the search + * @param searchers List of IndexSearchers for the given name + * @param hits Array of the scored hits from the search + * @param fields SortFields that were used to sort the hits + * @return String of Json encoded results + * @throws IOException + * @throws LuceneException + */ + private String encodeResults(String name, SearchBucket search, int maxResults, List searchers, + ScoreDoc[] hits, SortField[] fields) throws IOException, LuceneException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); + int shardIndex = -1; try (JsonGenerator gen = Json.createGenerator(baos)) { gen.writeStartObject(); - if (uid != null) { - gen.write("uid", uid); - } gen.writeStartArray("results"); for (ScoreDoc hit : hits) { - Document doc = isearcher.doc(hit.doc); - gen.writeStartArray(); - gen.write(Long.parseLong(doc.get("id"))); - gen.write(hit.score); - gen.writeEnd(); // array + shardIndex = hit.shardIndex; + encodeResult(name, gen, hit, searchers.get(shardIndex), search); } gen.writeEnd(); // array results - gen.writeEnd(); // object + if (hits.length == maxResults) { + ScoreDoc lastDoc = hits[hits.length - 1]; + shardIndex = lastDoc.shardIndex; + gen.writeStartObject("search_after").write("doc", lastDoc.doc).write("shardIndex", shardIndex); + float lastScore = lastDoc.score; + if (!Float.isNaN(lastScore)) { + gen.write("score", lastScore); + } + if (fields != null) { + Document lastDocument = searchers.get(shardIndex).doc(lastDoc.doc); + gen.writeStartArray("fields"); + for (SortField sortField : fields) { + encodeSearchAfterField(gen, sortField, lastDoc, lastDocument); + } + gen.writeEnd(); // end "fields" array + } + gen.writeEnd(); // end "search_after" object + } + gen.writeEnd(); // end enclosing object + } catch (ArrayIndexOutOfBoundsException e) { + String message = "Attempting to access searcher with shardIndex " + shardIndex + ", but only have " + + searchers.size() + " searchers in total"; + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, message); } - - search.lastDoc = hits.length == 0 ? null : hits[hits.length - 1]; - logger.debug("Json returned {}", baos.toString()); + logger.trace("Json returned {}", baos); return baos.toString(); } - private Query maybeEmptyQuery(Builder theQuery) { - Query query = theQuery.build(); - if (query.toString().isEmpty()) { - query = new MatchAllDocsQuery(); + /** + * Encodes a single SortField used in the search into the Json as to enable the + * ability to "search after" the last result of a previous search. 
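For reference, the "search_after" object that encodeResults() below appends when a page is full, rebuilt here with invented values; the caller passes this JSON back as the search_after query parameter of the next request, and "fields" carries one value per SortField used in the sort.

import jakarta.json.Json;
import jakarta.json.JsonObject;

public class SearchAfterExample {
    public static void main(String[] args) {
        JsonObject searchAfter = Json.createObjectBuilder()
                .add("doc", 499)        // Lucene doc id of the last hit
                .add("shardIndex", 0)   // shard that produced it
                .add("score", 7.3)      // only present for scored searches
                .add("fields", Json.createArrayBuilder()
                        .add(1638316800000L)  // e.g. a long sort field such as date
                        .add(9001L))          // e.g. the id tie-breaker
                .build();
        System.out.println(searchAfter);
    }
}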
+ * + * @param gen JsonGenerator used to encode the results + * @param sortField SortField used to sort the hits + * @param lastDoc The final scored hit of the search + * @param lastDocument The full Document corresponding to the last hit of the + * search + * @throws LuceneException + */ + private void encodeSearchAfterField(JsonGenerator gen, SortField sortField, ScoreDoc lastDoc, Document lastDocument) + throws LuceneException { + String fieldName = sortField.getField(); + if (fieldName == null) { + // SCORE sorting will have a null fieldName + if (Float.isFinite(lastDoc.score)) { + gen.write(lastDoc.score); + } + return; + } + IndexableField indexableField = lastDocument.getField(fieldName); + if (indexableField == null) { + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Field " + fieldName + + " used for sorting was not present on the Lucene Document; all sortable fields must also be stored."); + } + Type type = (sortField instanceof SortedNumericSortField) + ? ((SortedNumericSortField) sortField).getNumericType() + : sortField.getType(); + switch (type) { + case LONG: + if (indexableField.numericValue() != null) { + gen.write(indexableField.numericValue().longValue()); + } else if (indexableField.stringValue() != null) { + gen.write(Long.valueOf(indexableField.stringValue())); + } + break; + case DOUBLE: + if (indexableField.numericValue() != null) { + gen.write(indexableField.numericValue().doubleValue()); + } else if (indexableField.stringValue() != null) { + gen.write(Double.valueOf(indexableField.stringValue())); + } + break; + case STRING: + gen.write(indexableField.stringValue()); + break; + default: + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, + "SortField.Type must be one of LONG, DOUBLE, STRING, but it was " + type); + } + } + + /** + * Builds a Lucene Document from the parsed json. + * + * @param json Key value pairs of fields. + * @return Lucene Document. + */ + private Document parseDocument(JsonObject json) { + Document document = new Document(); + for (String key : json.keySet()) { + Field field = new Field(json, key, facetFields); + field.addToDocument(document); + convertUnits(json, document, key); } - logger.debug("Lucene query {}", query); - return query; + return document; } - private Builder parseParameter(JsonValue p) { - JsonObject parameter = (JsonObject) p; - BooleanQuery.Builder paramQuery = new BooleanQuery.Builder(); - String pName = parameter.getString("name", null); - if (pName != null) { - paramQuery.add(new WildcardQuery(new Term("name", pName)), Occur.MUST); + /** + * If key is "type.units", all relevant numeric fields are converted to SI units + * and added to the document. 
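A worked sketch of the conversion this performs, assuming the IcatUnits helper from icat.utils behaves as its use in convertValue() below suggests; the option string given to the constructor is an assumed example of the "units" run.property syntax, and eV to Joule is simply a familiar conversion.

import org.icatproject.utils.IcatUnits;
import org.icatproject.utils.IcatUnits.Value;

public class UnitConversionSketch {
    public static void main(String[] args) {
        // Assumed option syntax: convert eV to the SI unit J with the given factor.
        IcatUnits units = new IcatUnits("J: eV 1.602176634e-19");
        Value si = units.convertValueToSiUnits(5.0, "eV");
        if (si != null) {
            // Roughly 8.01e-19 with units "J"; these values become the numericValueSI and
            // type.unitsSI fields on the Lucene document.
            System.out.println(si.numericalValue + " " + si.units);
        }
    }
}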
+ * + * @param json A JsonObject representing the Document to be built + * @param document The new Document being built + * @param key A key present in json + * @retrun Whether a conversion has been performed or not + */ + private boolean convertUnits(JsonObject json, Document document, String key) { + // Whenever the units are set or changed, convert to SI + if (key.equals("type.units")) { + String unitString = json.getString("type.units"); + convertValue(document, json, unitString, "numericValue"); + convertValue(document, json, unitString, "rangeTop"); + convertValue(document, json, unitString, "rangeBottom"); + return true; } + return false; + } - String pUnits = parameter.getString("units", null); - if (pUnits != null) { - paramQuery.add(new WildcardQuery(new Term("units", pUnits)), Occur.MUST); + /** + * Attempts to convert numericFieldName from json into SI units from its + * recorded unitString, and then add it to the Lucene document. + * + * @param document Lucene Document to add the field to. + * @param json JsonObject containing the field/value pairs to be + * added. + * @param unitString Units of the value to be converted. + * @param numericFieldName Name (key) of the field to convert and add. + */ + private void convertValue(Document document, JsonObject json, String unitString, String numericFieldName) { + IndexableField field = document.getField(numericFieldName); + double numericalValue; + if (field != null) { + numericalValue = NumericUtils.sortableLongToDouble(field.numericValue().longValue()); + } else if (json.containsKey(numericFieldName)) { + numericalValue = json.getJsonNumber(numericFieldName).doubleValue(); + } else { + // If we aren't dealing with the desired numeric field don't convert + return; + } + logger.trace("Attempting to convert {} {}", numericalValue, unitString); + Value value = icatUnits.convertValueToSiUnits(numericalValue, unitString); + if (value != null) { + document.add(new StringField("type.unitsSI", value.units, Store.YES)); + document.add(new DoublePoint(numericFieldName + "SI", value.numericalValue)); + document.add(new StoredField(numericFieldName + "SI", value.numericalValue)); + long sortableLong = NumericUtils.doubleToSortableLong(value.numericalValue); + document.add(new NumericDocValuesField(numericFieldName + "SI", sortableLong)); } - String pStringValue = parameter.getString("stringValue", null); - String pLowerDateValue = parameter.getString("lowerDateValue", null); - String pUpperDateValue = parameter.getString("upperDateValue", null); - Double pLowerNumericValue = parameter.containsKey("lowerNumericValue") - ? parameter.getJsonNumber("lowerNumericValue").doubleValue() : null; - Double pUpperNumericValue = parameter.containsKey("upperNumericValue") - ? parameter.getJsonNumber("upperNumericValue").doubleValue() : null; - if (pStringValue != null) { - paramQuery.add(new WildcardQuery(new Term("stringValue", pStringValue)), Occur.MUST); - } else if (pLowerDateValue != null && pUpperDateValue != null) { - paramQuery.add(new TermRangeQuery("dateTimeValue", new BytesRef(pLowerDateValue), - new BytesRef(pUpperDateValue), true, true), Occur.MUST); + } - } else if (pLowerNumericValue != null && pUpperNumericValue != null) { - paramQuery.add(NumericRangeQuery.newDoubleRange("numericValue", pLowerNumericValue, pUpperNumericValue, - true, true), Occur.MUST); + /** + * Returns a new Lucene Document that has the same fields as were present in + * oldDocument, except in cases where json has an entry for that field. 
In this + * case, the json value is used instead. + * + * @param json Key value pairs of fields to overwrite fields already + * present in oldDocument. + * @param oldDocument Lucene Document to be updated. + * @return Lucene Document with updated fields. + */ + private Document updateDocumentFields(JsonObject json, Document oldDocument) { + Document newDocument = new Document(); + List fieldsSI = new ArrayList<>(); + boolean hasNewUnits = false; + for (IndexableField field : oldDocument.getFields()) { + String fieldName = field.name(); + if (json.containsKey(fieldName)) { + Field jsonField = new Field(json, fieldName, facetFields); + jsonField.addToDocument(newDocument); + hasNewUnits = hasNewUnits || convertUnits(json, newDocument, fieldName); + } else if (fieldName.endsWith("SI")) { + fieldsSI.add(new Field(field, facetFields)); + } else { + Field oldField = new Field(field, facetFields); + oldField.addToDocument(newDocument); + } + } + if (!hasNewUnits) { + fieldsSI.forEach((field) -> { + field.addToDocument(newDocument); + }); } - return paramQuery; + return newDocument; } + /** + * Returns a new Lucene Document that has the same fields as were present in + * oldDocument, except those provided as an argument to prune. + * + * @param fields These fields will not + * be present in the returned Document. + * @param oldDocument Lucene Document to be pruned. + * @return Lucene Document with pruned fields. + */ + private Document pruneDocument(Set fields, Document oldDocument) { + Document newDocument = new Document(); + for (IndexableField field : oldDocument.getFields()) { + if (!fields.contains(field.name())) { + Field fieldToAdd = new Field(field, facetFields); + fieldToAdd.addToDocument(newDocument); + } + } + return newDocument; + } + + /** + * Unlocks the specified index after population, committing all pending + * documents + * and allowing normal modify operations again. + * + * @param entityName Name of the entity/index to unlock. + * @throws LuceneException If not locked, or if there's an IOException when + * committing documents. + */ @POST @Path("unlock/{entityName}") public void unlock(@PathParam("entityName") String entityName) throws LuceneException { logger.debug("Requesting unlock of {} index", entityName); - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> createBucket(k)); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); if (!bucket.locked.compareAndSet(true, false)) { throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene is not currently locked for " + entityName); } try { - int cached = bucket.indexWriter.numRamDocs(); - bucket.indexWriter.commit(); - if (cached != 0) { - logger.debug("Unlock has committed {} {} changes to Lucene - now have {} documents indexed", cached, - entityName, bucket.indexWriter.numDocs()); - } - bucket.searcherManager.maybeRefreshBlocking(); + bucket.commit("Unlock", entityName); } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } } + /** + * Updates an existing Lucene document, provided that the target index is not + * locked for another operation. + * + * @param operationBody JsonObject containing the "_index" that the new "doc" + * should be created in. 
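A sketch of the operation body consumed by update() below, with invented values. The keys shown ("_index", "_id", "doc", and the fileSize/dataset.id/investigation.id fields inside "doc") are the ones the method actually reads; when aggregateFiles is enabled, a changed fileSize also adjusts the parent Dataset and Investigation totals via aggregateFileSize().

import jakarta.json.Json;
import jakarta.json.JsonObject;

public class UpdateOperationExample {
    public static void main(String[] args) {
        JsonObject operationBody = Json.createObjectBuilder()
                .add("_index", "Datafile")
                .add("_id", 12345L)
                .add("doc", Json.createObjectBuilder()
                        .add("id", 12345L)
                        .add("name", "run_00123.nxs")
                        .add("fileSize", 2048L)
                        .add("dataset.id", 678L)
                        .add("investigation.id", 90L))
                .build();
        System.out.println(operationBody);
    }
}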
+ * @throws LuceneException + * @throws NumberFormatException + * @throws IOException + */ + private void update(JsonObject operationBody) throws LuceneException, NumberFormatException, IOException { + String entityName = operationBody.getString("_index"); + if (DocumentMapping.relationships.containsKey(entityName)) { + updateByRelation(operationBody, false); + } + if (DocumentMapping.indexedEntities.contains(entityName)) { + long icatId = operationBody.getJsonNumber("_id").longValueExact(); + JsonObject documentObject = operationBody.getJsonObject("doc"); + Document document = parseDocument(documentObject); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); + if (bucket.locked.get()) { + throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, + "Lucene locked for " + entityName); + } + // Special case for filesizes + if (aggregateFiles && entityName.equals("Datafile")) { + JsonNumber jsonFileSize = documentObject.getJsonNumber("fileSize"); + if (jsonFileSize != null) { + long sizeToSubtract = 0; + List datafileSearchers = bucket.acquireSearchers(); + for (IndexSearcher datafileSearcher : datafileSearchers) { + TopDocs topDocs = datafileSearcher.search(LongPoint.newExactQuery("id", icatId), 1); + if (topDocs.totalHits.value == 1) { + int docId = topDocs.scoreDocs[0].doc; + Document datasetDocument = datafileSearcher.doc(docId); + sizeToSubtract = datasetDocument.getField("fileSize").numericValue().longValue(); + long sizeToAdd = jsonFileSize.longValueExact(); + if (sizeToAdd != sizeToSubtract) { + JsonNumber datasetId = documentObject.getJsonNumber("dataset.id"); + JsonNumber investigationId = documentObject.getJsonNumber("investigation.id"); + aggregateFileSize(sizeToAdd, sizeToSubtract, 0, datasetId, "dataset"); + aggregateFileSize(sizeToAdd, sizeToSubtract, 0, investigationId, "investigation"); + } + break; + } + } + } + } + logger.trace("update: {}", document); + bucket.updateDocument(icatId, facetsConfig.build(document)); + } + } + + /** + * Updates an existing Lucene document, provided that the target index is not + * locked + * for another operation. In this case, the entity being updated does not have + * its own index, but exists as fields on a parent. For example, + * InvestigationType on an Investigation. + * + * @param operationBody JsonObject containing the "_index" that the new "doc" + * should be created in. + * @param delete Whether to delete the related entity (or just update its + * values). 
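A sketch of the same operation shape for the related-entity case, using the InvestigationType example from the comment above. That this propagates to a denormalised "type.name" field on Investigation documents is an assumption about the DocumentMapping relationships, which are defined outside this excerpt; ids and names are invented.

import jakarta.json.Json;
import jakarta.json.JsonObject;

public class UpdateByRelationExample {
    public static void main(String[] args) {
        // Renaming an InvestigationType has no index of its own; updateByRelation() instead
        // rewrites the matching parent Investigation documents.
        JsonObject operationBody = Json.createObjectBuilder()
                .add("_index", "InvestigationType")
                .add("_id", 42L)
                .add("doc", Json.createObjectBuilder()
                        .add("type.name", "calibration"))
                .build();
        System.out.println(operationBody);
    }
}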
+ * @throws LuceneException + * @throws NumberFormatException + * @throws IOException + */ + private void updateByRelation(JsonObject operationBody, boolean delete) + throws LuceneException, NumberFormatException, IOException { + for (DocumentMapping.ParentRelationship parentRelationship : DocumentMapping.relationships + .get(operationBody.getString("_index"))) { + long childId = operationBody.getJsonNumber("_id").longValueExact(); + IndexBucket bucket = indexBuckets.computeIfAbsent(parentRelationship.parentName.toLowerCase(), + k -> new IndexBucket(k)); + if (bucket.locked.get()) { + throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, + "Lucene locked for " + parentRelationship.parentName); + } + IndexSearcher searcher = getSearcher(new HashMap<>(), parentRelationship.parentName); + + int blockSize = 10000; + Query query = LongPoint.newExactQuery(parentRelationship.joiningField, childId); + Sort sort = new Sort(new SortField("id", Type.LONG)); + ScoreDoc[] scoreDocs = searcher.search(query, blockSize, sort).scoreDocs; + while (scoreDocs.length != 0) { + for (ScoreDoc scoreDoc : scoreDocs) { + Document oldDocument = searcher.doc(scoreDoc.doc); + long parentId = oldDocument.getField("id").numericValue().longValue(); + Document newDocument = delete ? pruneDocument(parentRelationship.fields, oldDocument) + : updateDocumentFields(operationBody.getJsonObject("doc"), oldDocument); + logger.trace("updateByRelation: {}", newDocument); + bucket.updateDocument(parentId, facetsConfig.build(newDocument)); + } + scoreDocs = searcher.searchAfter(scoreDocs[scoreDocs.length - 1], query, blockSize, sort).scoreDocs; + } + } + } + } diff --git a/src/main/java/org/icatproject/lucene/SearchBucket.java b/src/main/java/org/icatproject/lucene/SearchBucket.java new file mode 100644 index 0000000..2c51f76 --- /dev/null +++ b/src/main/java/org/icatproject/lucene/SearchBucket.java @@ -0,0 +1,916 @@ +package org.icatproject.lucene; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.StringReader; +import java.net.HttpURLConnection; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TimeZone; +import java.util.Map.Entry; + +import jakarta.json.Json; +import jakarta.json.JsonArray; +import jakarta.json.JsonNumber; +import jakarta.json.JsonObject; +import jakarta.json.JsonReader; +import jakarta.json.JsonString; +import jakarta.json.JsonValue; +import jakarta.json.JsonValue.ValueType; +import jakarta.servlet.http.HttpServletRequest; + +import org.apache.lucene.document.DoublePoint; +import org.apache.lucene.document.LongPoint; +import org.apache.lucene.facet.range.DoubleRange; +import org.apache.lucene.facet.range.LongRange; +import org.apache.lucene.facet.range.Range; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryparser.flexible.core.QueryNodeException; +import org.apache.lucene.queryparser.flexible.core.QueryNodeParseException; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FieldDoc; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MatchNoDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.SortedNumericSortField; +import 
org.apache.lucene.search.TermInSetQuery; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery.Builder; +import org.apache.lucene.search.SortField.Type; +import org.apache.lucene.search.join.JoinUtil; +import org.apache.lucene.search.join.ScoreMode; +import org.apache.lucene.util.BytesRef; +import org.icatproject.lucene.exceptions.LuceneException; +import org.icatproject.utils.IcatUnits.Value; + +/** + * Bucket for information relating to a single search. + */ +public class SearchBucket { + + public enum SearchType { + DATAFILE, DATASET, INVESTIGATION, GENERIC + } + + private Lucene lucene; + public Map> searcherMap; + public Query query; + public Sort sort; + public FieldDoc searchAfter; + public boolean scored; + public Set fields = new HashSet<>(); + public Map> joinedFields = new HashMap<>(); + public Map dimensions = new HashMap<>(); + private static final SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmm"); + + static { + TimeZone tz = TimeZone.getTimeZone("GMT"); + df.setTimeZone(tz); + } + + /** + * Creates an empty search bucket. + * + * @param lucene IcatLucene instance. + */ + public SearchBucket(Lucene lucene) { + this.lucene = lucene; + searcherMap = new HashMap<>(); + } + + /** + * Creates a new search from the provided request and Url parameters. + * + * @param lucene IcatLucene instance. + * @param searchType The SearchType determines how the query is built for + * specific entities. + * @param request Incoming Http request containing the query as Json. + * @param sort Sort criteria as a Json encoded string. + * @param searchAfter The last FieldDoc of a previous search, encoded as Json. + * @throws LuceneException + * @throws IOException + * @throws QueryNodeException + */ + public SearchBucket(Lucene lucene, SearchType searchType, HttpServletRequest request, String sort, + String searchAfter) throws LuceneException, IOException, QueryNodeException { + this.lucene = lucene; + searcherMap = new HashMap<>(); + parseSort(sort); + try (JsonReader r = Json.createReader(request.getInputStream())) { + JsonObject o = r.readObject(); + parseFields(o); + parseDimensions(o); + JsonObject jsonQuery = o.getJsonObject("query"); + switch (searchType) { + case GENERIC: + parseGenericQuery(jsonQuery); + return; + case DATAFILE: + parseDatafileQuery(searchAfter, jsonQuery); + return; + case DATASET: + parseDatasetQuery(searchAfter, jsonQuery); + return; + case INVESTIGATION: + parseInvestigationQuery(searchAfter, jsonQuery); + return; + } + } catch (QueryNodeParseException e) { + String message = "Search term could not be parsed due to syntax errors"; + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, message); + } + } + + private void parseDatafileQuery(String searchAfter, JsonObject jsonQuery) + throws LuceneException, IOException, QueryNodeException { + BooleanQuery.Builder luceneQuery = new BooleanQuery.Builder(); + parseSearchAfter(searchAfter); + buildFilterQueries("datafile", jsonQuery, luceneQuery); + + String userName = jsonQuery.getString("user", null); + if (userName != null) { + buildUserNameQuery(userName, luceneQuery, "investigation.id"); + } + + String text = jsonQuery.getString("text", null); + if (text != null) { + luceneQuery.add(DocumentMapping.datafileParser.parse(text, null), Occur.MUST); + } + + buildDateRanges(luceneQuery, jsonQuery, "lower", "upper", "date"); + + if (jsonQuery.containsKey("parameters")) { + 
JsonArray parameters = jsonQuery.getJsonArray("parameters"); + IndexSearcher datafileParameterSearcher = lucene.getSearcher(searcherMap, "DatafileParameter"); + for (JsonValue p : parameters) { + BooleanQuery.Builder paramQuery = parseParameter(p); + Query toQuery = JoinUtil.createJoinQuery("datafile.id", false, "id", Long.class, paramQuery.build(), + datafileParameterSearcher, ScoreMode.None); + luceneQuery.add(toQuery, Occur.MUST); + } + } + query = maybeEmptyQuery(luceneQuery); + } + + private void parseDatasetQuery(String searchAfter, JsonObject jsonQuery) + throws LuceneException, IOException, QueryNodeException { + BooleanQuery.Builder luceneQuery = new BooleanQuery.Builder(); + parseSearchAfter(searchAfter); + buildFilterQueries("dataset", jsonQuery, luceneQuery); + + String userName = jsonQuery.getString("user", null); + if (userName != null) { + buildUserNameQuery(userName, luceneQuery, "investigation.id"); + } + + String text = jsonQuery.getString("text", null); + if (text != null) { + luceneQuery.add(DocumentMapping.datasetParser.parse(text, null), Occur.MUST); + } + + buildDateRanges(luceneQuery, jsonQuery, "lower", "upper", "startDate", "endDate"); + + if (jsonQuery.containsKey("parameters")) { + JsonArray parameters = jsonQuery.getJsonArray("parameters"); + IndexSearcher parameterSearcher = lucene.getSearcher(searcherMap, "DatasetParameter"); + for (JsonValue p : parameters) { + BooleanQuery.Builder paramQuery = parseParameter(p); + Query toQuery = JoinUtil.createJoinQuery("dataset.id", false, "id", Long.class, paramQuery.build(), + parameterSearcher, ScoreMode.None); + luceneQuery.add(toQuery, Occur.MUST); + } + } + query = maybeEmptyQuery(luceneQuery); + } + + private void parseInvestigationQuery(String searchAfter, JsonObject jsonQuery) + throws LuceneException, IOException, QueryNodeException { + BooleanQuery.Builder luceneQuery = new BooleanQuery.Builder(); + parseSearchAfter(searchAfter); + buildFilterQueries("investigation", jsonQuery, luceneQuery); + + String userName = jsonQuery.getString("user", null); + if (userName != null) { + buildUserNameQuery(userName, luceneQuery, "id"); + } + + String text = jsonQuery.getString("text", null); + if (text != null) { + Builder textBuilder = new BooleanQuery.Builder(); + textBuilder.add(DocumentMapping.investigationParser.parse(text, null), Occur.SHOULD); + + IndexSearcher sampleSearcher = lucene.getSearcher(searcherMap, "Sample"); + Query joinedSampleQuery = JoinUtil.createJoinQuery("sample.investigation.id", false, "id", Long.class, + DocumentMapping.sampleParser.parse(text, null), sampleSearcher, ScoreMode.Avg); + textBuilder.add(joinedSampleQuery, Occur.SHOULD); + luceneQuery.add(textBuilder.build(), Occur.MUST); + } + + buildDateRanges(luceneQuery, jsonQuery, "lower", "upper", "startDate", "endDate"); + + if (jsonQuery.containsKey("parameters")) { + JsonArray parameters = jsonQuery.getJsonArray("parameters"); + IndexSearcher parameterSearcher = lucene.getSearcher(searcherMap, "InvestigationParameter"); + for (JsonValue p : parameters) { + BooleanQuery.Builder paramQuery = parseParameter(p); + Query toQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", Long.class, + paramQuery.build(), + parameterSearcher, ScoreMode.None); + luceneQuery.add(toQuery, Occur.MUST); + } + } + + String userFullName = jsonQuery.getString("userFullName", null); + if (userFullName != null) { + BooleanQuery.Builder userFullNameQuery = new BooleanQuery.Builder(); + userFullNameQuery.add(DocumentMapping.genericParser.parse(userFullName, 
"user.fullName"), + Occur.MUST); + IndexSearcher investigationUserSearcher = lucene.getSearcher(searcherMap, "InvestigationUser"); + Query toQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", Long.class, + userFullNameQuery.build(), + investigationUserSearcher, ScoreMode.None); + luceneQuery.add(toQuery, Occur.MUST); + } + query = maybeEmptyQuery(luceneQuery); + } + + /** + * Extracts values from queryJson in order to add one or more range query terms + * using queryBuilder. + * + * Note that values in queryJson are expected to be precise only to the minute, + * and so to ensure that our range is inclusive, we add 59.999 seconds onto the + * upper value only. + * + * If either upper or lower keys do not yield values then a half open range is + * created. If both are absent, then nothing is added to the query. + * + * @param queryBuilder Builder for the Lucene query. + * @param queryJson JsonObject representing the query parameters. + * @param lowerKey Key in queryJson of the lower date value + * @param upperKey Key in queryJson of the upper date value + * @param fields Name of one or more fields to apply the range query to. + * @throws LuceneException + */ + private void buildDateRanges(Builder queryBuilder, JsonObject queryJson, String lowerKey, String upperKey, + String... fields) throws LuceneException { + long lower = parseDate(queryJson, lowerKey, 0); + long upper = parseDate(queryJson, upperKey, 59999); + // Only build the query if at least one of the dates is defined + if (lower != Long.MIN_VALUE || upper != Long.MAX_VALUE) { + for (String field : fields) { + queryBuilder.add(LongPoint.newRangeQuery(field, lower, upper), Occur.MUST); + } + } + } + + /** + * Builds Term queries (exact string matches without tokenizing) Range queries + * or Nested/Joined queries from the filter + * object in the query request. + * + * @param requestedQuery Json object containing details of the query. + * @param queryBuilder Builder for the overall boolean query to be build. + * @throws LuceneException If the values in the filter object are neither STRING + * nor ARRAY of STRING. + * @throws IOException + */ + private void buildFilterQueries(String target, JsonObject requestedQuery, Builder queryBuilder) + throws LuceneException, IOException { + if (requestedQuery.containsKey("filter")) { + JsonObject filterObject = requestedQuery.getJsonObject("filter"); + for (String key : filterObject.keySet()) { + JsonValue value = filterObject.get(key); + ValueType valueType = value.getValueType(); + int i = key.indexOf("."); + String filterTarget = i == -1 ? key : key.substring(0, i); + String fld = key.substring(i + 1); + Query dimensionQuery; + if (valueType.equals(ValueType.ARRAY)) { + Builder builder = new BooleanQuery.Builder(); + // If the key was just a nested entity (no ".") then we should FILTER all of our + // queries on that entity. + Occur occur = i == -1 ? 
Occur.FILTER : Occur.SHOULD; + for (JsonValue arrayValue : filterObject.getJsonArray(key)) { + Query arrayQuery = parseFilter(target, fld, arrayValue); + builder.add(arrayQuery, occur); + } + dimensionQuery = builder.build(); + } else { + dimensionQuery = parseFilter(target, fld, value); + } + // Nest the dimension query if needed + if (i != -1 && !target.equals(filterTarget)) { + // If we are targeting a different entity, nest the entire array as SHOULD + // BUT only if we haven't already nested the queries (as we do when the key was + // just a nested entity) + IndexSearcher nestedSearcher = lucene.getSearcher(searcherMap, filterTarget); + Query nestedQuery; + if (filterTarget.equals("sample") && target.equals("investigation")) { + nestedQuery = JoinUtil.createJoinQuery("sample.investigation.id", false, "id", Long.class, + dimensionQuery, nestedSearcher, ScoreMode.None); + } else if (filterTarget.toLowerCase().equals("investigationinstrument") && !target.equals("investigation")) { + nestedQuery = JoinUtil.createJoinQuery("investigation.id", false, "investigation.id", Long.class, dimensionQuery, + nestedSearcher, ScoreMode.None); + } else { + nestedQuery = JoinUtil.createJoinQuery(target + ".id", false, "id", Long.class, dimensionQuery, + nestedSearcher, ScoreMode.None); + } + queryBuilder.add(nestedQuery, Occur.FILTER); + } else { + // Otherwise, just add as SHOULD to the main query directly + queryBuilder.add(dimensionQuery, Occur.FILTER); + } + } + } + } + + /** + * Parses a single filter field value pair into Lucene objects. Can handle + * simple strings, range objects or nested filters. + * + * @param target The target entity of the search, but not necessarily this + * filter + * @param fld The field to apply the query to + * @param value JsonValue (JsonString or JsonObject) to parse a Lucene Query + * from + * @return A Lucene Query object parsed from the provided value + * @throws IOException + * @throws LuceneException + */ + private Query parseFilter(String target, String fld, JsonValue value) throws IOException, LuceneException { + ValueType valueType = value.getValueType(); + switch (valueType) { + case STRING: + // Simplest case involving a single field/value pair + return new TermQuery(new Term(fld + ".keyword", ((JsonString) value).getString())); + + case OBJECT: + JsonObject valueObject = (JsonObject) value; + if (valueObject.containsKey("filter")) { + // Parse a nested query + IndexSearcher nestedSearcher = lucene.getSearcher(searcherMap, fld); + List nestedFilters = valueObject.getJsonArray("filter").getValuesAs(JsonObject.class); + Builder nestedBoolBuilder = new BooleanQuery.Builder(); + nestedFilters.forEach(nestedFilter -> { + String nestedField = nestedFilter.getString("field"); + if (nestedFilter.containsKey("value")) { + Term term = new Term(nestedField + ".keyword", nestedFilter.getString("value")); + TermQuery query = new TermQuery(term); + nestedBoolBuilder.add(query, Occur.FILTER); + } else if (nestedFilter.containsKey("exact")) { + buildNestedExactQuery(nestedField, nestedFilter, nestedBoolBuilder); + } else { + buildNestedRangeQuery(nestedField, nestedFilter, nestedBoolBuilder); + } + }); + if (fld.contains("sample") && !target.equals("investigation")) { + // Datasets and Datafiles join by sample.id on both fields + return JoinUtil.createJoinQuery("sample.id", false, "sample.id", Long.class, + nestedBoolBuilder.build(), nestedSearcher, ScoreMode.None); + } else if (fld.equals("sampleparameter") && target.equals("investigation")) { + Query sampleQuery = 
JoinUtil.createJoinQuery("sample.id", false, "sample.id", Long.class, + nestedBoolBuilder.build(), nestedSearcher, ScoreMode.None); + return JoinUtil.createJoinQuery("sample.investigation.id", false, "id", Long.class, sampleQuery, + lucene.getSearcher(searcherMap, "sample"), ScoreMode.None); + } else { + return JoinUtil.createJoinQuery(target + ".id", false, "id", Long.class, + nestedBoolBuilder.build(), nestedSearcher, ScoreMode.None); + } + } else { + // Single range of values for a field + JsonNumber from = valueObject.getJsonNumber("from"); + JsonNumber to = valueObject.getJsonNumber("to"); + if (DocumentMapping.longFields.contains(fld)) { + return LongPoint.newRangeQuery(fld, from.longValueExact(), to.longValueExact()); + } else { + return DoublePoint.newRangeQuery(fld, from.doubleValue(), to.doubleValue()); + } + } + + default: + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "filter object values should be STRING or OBJECT, but were " + valueType); + } + } + + /** + * Builds an exact numeric query, intended for use with numeric or date/time + * parameters. + * + * @param fld Name of the field to apply the range to. + * @param valueObject JsonObject containing "exact", and optionally "units" + * as keys for an exact value. + * @param builder BooleanQuery.Builder for the nested query + */ + private void buildNestedExactQuery(String fld, JsonObject valueObject, BooleanQuery.Builder builder) { + if (DocumentMapping.longFields.contains(fld)) { + long exact = valueObject.getJsonNumber("exact").longValueExact(); + builder.add(LongPoint.newExactQuery(fld, exact), Occur.FILTER); + } else { + Builder rangeBuilder = new BooleanQuery.Builder(); + Builder exactOrRangeBuilder = new BooleanQuery.Builder(); + double exact = valueObject.getJsonNumber("exact").doubleValue(); + String units = valueObject.getString("units", null); + if (units != null) { + Value exactValue = lucene.icatUnits.convertValueToSiUnits(exact, units); + if (exactValue != null) { + // If we were able to parse the units, apply query to the SI value + Query topQuery = DoublePoint.newRangeQuery("rangeTopSI", exactValue.numericalValue, + Double.POSITIVE_INFINITY); + Query bottomQuery = DoublePoint.newRangeQuery("rangeBottomSI", Double.NEGATIVE_INFINITY, + exactValue.numericalValue); + Query exactQuery = DoublePoint.newExactQuery(fld + "SI", exactValue.numericalValue); + rangeBuilder.add(topQuery, Occur.FILTER); + rangeBuilder.add(bottomQuery, Occur.FILTER); + exactOrRangeBuilder.add(rangeBuilder.build(), Occur.SHOULD); + exactOrRangeBuilder.add(exactQuery, Occur.SHOULD); + builder.add(exactOrRangeBuilder.build(), Occur.FILTER); + } else { + // If units could not be parsed, make them part of the query on the raw data + rangeBuilder.add(DoublePoint.newRangeQuery("rangeTop", exact, Double.POSITIVE_INFINITY), + Occur.FILTER); + rangeBuilder.add(DoublePoint.newRangeQuery("rangeBottom", Double.NEGATIVE_INFINITY, exact), + Occur.FILTER); + exactOrRangeBuilder.add(rangeBuilder.build(), Occur.SHOULD); + exactOrRangeBuilder.add(DoublePoint.newExactQuery(fld, exact), Occur.SHOULD); + builder.add(exactOrRangeBuilder.build(), Occur.FILTER); + builder.add(new TermQuery(new Term("type.units", units)), Occur.FILTER); + } + } else { + // If units were not provided, just apply to the raw data + rangeBuilder.add(DoublePoint.newRangeQuery("rangeTop", exact, Double.POSITIVE_INFINITY), Occur.FILTER); + rangeBuilder.add(DoublePoint.newRangeQuery("rangeBottom", Double.NEGATIVE_INFINITY, exact), + Occur.FILTER); + 
exactOrRangeBuilder.add(rangeBuilder.build(), Occur.SHOULD); + exactOrRangeBuilder.add(DoublePoint.newExactQuery(fld, exact), Occur.SHOULD); + builder.add(exactOrRangeBuilder.build(), Occur.FILTER); + } + } + } + + /** + * Builds a range query, intended for use with numeric or date/time parameters. + * + * @param fld Name of the field to apply the range to. + * @param valueObject JsonObject containing "from", "to" and optionally "units" + * as keys for a range of values. + * @param builder BooleanQuery.Builder for the nested query + */ + private void buildNestedRangeQuery(String fld, JsonObject valueObject, BooleanQuery.Builder builder) { + if (DocumentMapping.longFields.contains(fld)) { + long from = Long.MIN_VALUE; + long to = Long.MAX_VALUE; + try { + from = valueObject.getJsonNumber("from").longValueExact(); + } catch (ArithmeticException e) { + // pass + } + try { + to = valueObject.getJsonNumber("to").longValueExact(); + } catch (ArithmeticException e) { + // pass + } + builder.add(LongPoint.newRangeQuery(fld, from, to), Occur.FILTER); + } else { + double from = valueObject.getJsonNumber("from").doubleValue(); + double to = valueObject.getJsonNumber("to").doubleValue(); + String units = valueObject.getString("units", null); + if (units != null) { + Value fromValue = lucene.icatUnits.convertValueToSiUnits(from, units); + Value toValue = lucene.icatUnits.convertValueToSiUnits(to, units); + if (fromValue != null && toValue != null) { + // If we were able to parse the units, apply query to the SI value + Query rangeQuery = DoublePoint.newRangeQuery(fld + "SI", fromValue.numericalValue, + toValue.numericalValue); + builder.add(rangeQuery, Occur.FILTER); + } else { + // If units could not be parsed, make them part of the query on the raw data + builder.add(DoublePoint.newRangeQuery(fld, from, to), Occur.FILTER); + builder.add(new TermQuery(new Term("type.units", units)), Occur.FILTER); + } + } else { + // If units were not provided, just apply to the raw data + builder.add(DoublePoint.newRangeQuery(fld, from, to), Occur.FILTER); + } + } + } + + /** + * Builds a query against InvestigationUser and InstrumentScientist entities + * using the provided userName. + * + * @param userName The value of the user.name field to query for. + * @param luceneQuery BooleanQuery.Builder in use for main entity query. + * @param toField The field on the main entity to join to, practically + * either "id" or "investigation.id". 
+ * @throws IOException + * @throws LuceneException + */ + private void buildUserNameQuery(String userName, BooleanQuery.Builder luceneQuery, String toField) + throws IOException, LuceneException { + TermQuery fromQuery = new TermQuery(new Term("user.name", userName)); + Query investigationUserQuery = JoinUtil.createJoinQuery("investigation.id", false, toField, Long.class, + fromQuery, lucene.getSearcher(searcherMap, "InvestigationUser"), ScoreMode.None); + Query instrumentScientistQuery = JoinUtil.createJoinQuery("instrument.id", false, "instrument.id", Long.class, + fromQuery, lucene.getSearcher(searcherMap, "InstrumentScientist"), ScoreMode.None); + Query investigationInstrumentQuery = JoinUtil.createJoinQuery("investigation.id", false, toField, Long.class, + instrumentScientistQuery, lucene.getSearcher(searcherMap, "InvestigationInstrument"), ScoreMode.None); + Builder userNameQueryBuilder = new BooleanQuery.Builder(); + userNameQueryBuilder.add(investigationUserQuery, Occur.SHOULD).add(investigationInstrumentQuery, Occur.SHOULD); + luceneQuery.add(userNameQueryBuilder.build(), Occur.MUST); + } + + /** + * Converts String into number of ms since epoch. + * + * @param value String representing a Date in the format "yyyyMMddHHmm". + * @return Number of ms since epoch. + * @throws java.text.ParseException + */ + protected static long decodeTime(String value) throws java.text.ParseException { + synchronized (df) { + return df.parse(value).getTime(); + } + } + + /** + * Either builds the query from the provided builder, or creates a + * MatchAllDocsQuery to use if the Builder was empty. + * + * @param luceneQuery BooleanQuery.Builder + * @return Lucene Query + */ + private Query maybeEmptyQuery(Builder luceneQuery) { + Query query = luceneQuery.build(); + if (query.toString().isEmpty()) { + query = new MatchAllDocsQuery(); + } + return query; + } + + /** + * Parses a date/time value from jsonObject. Can account for either a Long + * value, or a String value encoded in the format yyyyMMddHHmm. + * + * @param jsonObject JsonObject containing the date to be parsed. + * @param key Key of the date/time value in jsonObject. + * @param offset In the case of STRING ValueType, add offset ms before + * returning. This accounts for the fact the String format + * used is only precise to minutes and not seconds. + * @return null if jsonObject does not contain the key, number of ms since epoch + * otherwise. + * @throws LuceneException If the ValueType is not NUMBER or STRING, or if a + * STRING value cannot be parsed. + */ + private long parseDate(JsonObject jsonObject, String key, int offset) throws LuceneException { + if (jsonObject.containsKey(key)) { + ValueType valueType = jsonObject.get(key).getValueType(); + switch (valueType) { + case STRING: + String dateString = jsonObject.getString(key); + try { + return decodeTime(dateString) + offset; + } catch (Exception e) { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Could not parse date " + dateString + " using expected format yyyyMMddHHmm"); + } + case NUMBER: + return jsonObject.getJsonNumber(key).longValueExact(); + default: + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Dates should be represented by a NUMBER or STRING JsonValue, but got " + valueType); + } + } + // If the key wasn't present, use eiter MIN_VALUE or MAX_VALUE based on whether + // we need to offset the date. This is useful for half open ranges. 
+ if (offset == 0) { + return Long.MIN_VALUE; + } else { + return Long.MAX_VALUE; + } + } + + /** + * Parses dimensions to apply faceting to from the incoming Json. If ranges are + * specified, these are also parsed. + * + * @param jsonObject Json from incoming search request. + * @throws LuceneException + */ + private void parseDimensions(JsonObject jsonObject) throws LuceneException { + if (jsonObject.containsKey("dimensions")) { + List dimensionObjects = jsonObject.getJsonArray("dimensions").getValuesAs(JsonObject.class); + for (JsonObject dimensionObject : dimensionObjects) { + if (!dimensionObject.containsKey("dimension")) { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "'dimension' not specified for facet request " + dimensionObject); + } + String dimension = dimensionObject.getString("dimension"); + FacetedDimension facetDimensionRequest = new FacetedDimension(dimension); + if (dimensionObject.containsKey("ranges")) { + List ranges = facetDimensionRequest.getRanges(); + List jsonRanges = dimensionObject.getJsonArray("ranges").getValuesAs(JsonObject.class); + if (DocumentMapping.longFields.contains(dimension)) { + for (JsonObject range : jsonRanges) { + long lower = Long.MIN_VALUE; + long upper = Long.MAX_VALUE; + if (range.containsKey("from")) { + lower = range.getJsonNumber("from").longValueExact(); + } + if (range.containsKey("to")) { + upper = range.getJsonNumber("to").longValueExact(); + } + String label = lower + "-" + upper; + if (range.containsKey("key")) { + label = range.getString("key"); + } + ranges.add(new LongRange(label, lower, true, upper, false)); + } + } else if (DocumentMapping.doubleFields.contains(dimension)) { + for (JsonObject range : jsonRanges) { + double lower = Double.MIN_VALUE; + double upper = Double.MAX_VALUE; + if (range.containsKey("from")) { + lower = range.getJsonNumber("from").doubleValue(); + } + if (range.containsKey("to")) { + upper = range.getJsonNumber("to").doubleValue(); + } + String label = lower + "-" + upper; + if (range.containsKey("key")) { + label = range.getString("key"); + } + ranges.add(new DoubleRange(label, lower, true, upper, false)); + } + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "'ranges' specified for dimension " + dimension + + " but this is not a supported numeric field"); + } + } + dimensions.put(dimension, facetDimensionRequest); + } + } + } + + /** + * Parses the fields to return with the search results from Json. + * + * @param jsonObject The Json from the search request. + * @throws LuceneException If the parsing fails. + */ + public void parseFields(JsonObject jsonObject) throws LuceneException { + if (jsonObject.containsKey("fields")) { + List fieldStrings = jsonObject.getJsonArray("fields").getValuesAs(JsonString.class); + // logger.trace("Parsing fields from {}", fieldStrings); + for (JsonString jsonString : fieldStrings) { + String[] splitString = jsonString.getString().split(" "); + if (splitString.length == 1) { + // Fields without a space apply directly to the target entity + fields.add(splitString[0]); + } else if (splitString.length == 2) { + // Otherwise, the first element is the target of a join, with the second being a + // field on that joined entity. 
+ if (joinedFields.containsKey(splitString[0])) { + joinedFields.get(splitString[0]).add(splitString[1]); + } else { + joinedFields.putIfAbsent(splitString[0], + new HashSet(Arrays.asList(splitString[1]))); + } + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Could not parse field: " + jsonString.getString()); + } + } + } + } + + /** + * Parses a query and associated information from an incoming request without + * any logic specific to a single index or entity. As such it may not be as + * powerful, but is sufficient for simple queries (like those for faceting). + * + * @param jsonQuery Incoming query request encoded as Json. + * @param luceneQuery Lucene BooleanQuery.Builder + * @throws LuceneException If the types of the JsonValues in the query do not + * match those supported by icat.lucene + */ + private void parseGenericQuery(JsonObject jsonQuery) throws LuceneException { + BooleanQuery.Builder luceneQuery = new BooleanQuery.Builder(); + for (Entry entry : jsonQuery.entrySet()) { + String field = entry.getKey(); + ValueType valueType = entry.getValue().getValueType(); + switch (valueType) { + case STRING: + JsonString stringValue = (JsonString) entry.getValue(); + String fld = lucene.facetFields.contains(field) ? field + ".keyword" : field; + luceneQuery.add(new TermQuery(new Term(fld, stringValue.getString())), Occur.MUST); + break; + case NUMBER: + JsonNumber numberValue = (JsonNumber) entry.getValue(); + if (DocumentMapping.longFields.contains(field)) { + luceneQuery.add(LongPoint.newExactQuery(field, numberValue.longValueExact()), Occur.FILTER); + } else if (DocumentMapping.doubleFields.contains(field)) { + luceneQuery.add(DoublePoint.newExactQuery(field, numberValue.doubleValue()), Occur.FILTER); + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Value had type NUMBER, but field " + field + + " is not a known longField or doubleField"); + } + break; + case ARRAY: + ArrayList longList = new ArrayList<>(); + ArrayList bytesRefList = new ArrayList<>(); + JsonArray arrayValue = (JsonArray) entry.getValue(); + for (JsonValue value : arrayValue) { + ValueType arrayValueType = value.getValueType(); + switch (arrayValueType) { + case NUMBER: + longList.add(((JsonNumber) value).longValueExact()); + break; + default: + bytesRefList.add(new BytesRef(((JsonString) value).getString())); + break; + } + } + + if (longList.size() == 0 && bytesRefList.size() == 0) { + query = new MatchNoDocsQuery("Tried filtering" + field + " with an empty array"); + return; + } + if (longList.size() != 0) { + luceneQuery.add(LongPoint.newSetQuery(field, longList), Occur.MUST); + } + if (bytesRefList.size() != 0) { + luceneQuery.add(new TermInSetQuery(field, bytesRefList), Occur.MUST); + } + break; + default: + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Query values should be ARRAY, STRING or NUMBER, but had value of type " + valueType); + } + } + query = maybeEmptyQuery(luceneQuery); + } + + /** + * Parses query applying to a single parameter from incoming Json. + * + * @param p JsonValue (JsonObject) representing a query against a single + * parameter. + * @return BooleanQuery.Builder for a single parameter. 
+ * @throws LuceneException + */ + private Builder parseParameter(JsonValue p) throws LuceneException { + JsonObject parameter = (JsonObject) p; + BooleanQuery.Builder paramQuery = new BooleanQuery.Builder(); + String pName = parameter.getString("name", null); + if (pName != null) { + paramQuery.add(new WildcardQuery(new Term("type.name.keyword", pName)), Occur.MUST); + } + + String pUnits = parameter.getString("units", null); + if (pUnits != null) { + paramQuery.add(new WildcardQuery(new Term("type.units", pUnits)), Occur.MUST); + } + if (parameter.containsKey("stringValue")) { + String pStringValue = parameter.getString("stringValue", null); + paramQuery.add(new WildcardQuery(new Term("stringValue", pStringValue)), Occur.MUST); + } else if (parameter.containsKey("lowerDateValue") && parameter.containsKey("upperDateValue")) { + buildDateRanges(paramQuery, parameter, "lowerDateValue", "upperDateValue", "dateTimeValue"); + } else if (parameter.containsKey("lowerNumericValue") && parameter.containsKey("upperNumericValue")) { + double pLowerNumericValue = parameter.getJsonNumber("lowerNumericValue").doubleValue(); + double pUpperNumericValue = parameter.getJsonNumber("upperNumericValue").doubleValue(); + paramQuery.add(DoublePoint.newRangeQuery("numericValue", pLowerNumericValue, pUpperNumericValue), + Occur.MUST); + } + return paramQuery; + } + + /** + * Parses a Lucene FieldDoc to be "searched after" from a String representation + * of a JSON array. null if searchAfter was itself null or an empty String. + * + * @param searchAfter String representation of a JSON object containing the + * document id or "doc" (String), score ("float") in that + * order. + * @return FieldDoc object built from the provided String, or + * @throws LuceneException If an entry in the fields array is not a STRING or + * NUMBER + */ + private void parseSearchAfter(String searchAfter) throws LuceneException { + if (searchAfter == null || searchAfter.equals("")) { + return; + } + SortField[] sortFields = sort.getSort(); + JsonReader reader = Json.createReader(new StringReader(searchAfter)); + JsonObject object = reader.readObject(); + // shardIndex and Lucene doc Id are always needed to determine tie breaks, even + // if the field sort resulted in no ties in the first place + int shardIndex = object.getInt("shardIndex"); + int doc = object.getInt("doc"); + float score = Float.NaN; + List fields = new ArrayList<>(); + if (object.containsKey("score")) { + score = object.getJsonNumber("score").bigDecimalValue().floatValue(); + } + if (object.containsKey("fields")) { + JsonArray jsonArray = object.getJsonArray("fields"); + if (jsonArray.size() != sortFields.length) { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "fields should have the same length as sort, but they were " + + jsonArray.size() + " and " + sortFields.length); + } + for (int i = 0; i < sortFields.length; i++) { + JsonValue value = jsonArray.get(i); + switch (value.getValueType()) { + case NUMBER: + JsonNumber number = ((JsonNumber) value); + switch (sortFields[i].getType()) { + case FLOAT: + case DOUBLE: + case SCORE: + fields.add(number.bigDecimalValue().floatValue()); + break; + case INT: + case LONG: + case DOC: + case CUSTOM: + fields.add(number.longValueExact()); + break; + default: + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "fields contained a NUMBER but the corresponding field was " + + sortFields[i]); + } + break; + case STRING: + fields.add(new BytesRef(((JsonString) value).getString())); + break; + 
default: + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "fields should be an array of STRING and NUMBER, but had entry of type " + + value.getValueType()); + } + } + } + this.searchAfter = new FieldDoc(doc, score, fields.toArray(), shardIndex); + } + + /** + * Parses the String from the request into a Lucene Sort object. Multiple sort + * criteria are supported, and will be applied in order. + * + * @param sortString String representation of a JSON object with the field(s) to + * sort + * as keys, and the direction ("asc" or "desc") as value(s). + * @return Lucene Sort object + * @throws LuceneException If the value for any key isn't "asc" or "desc" + */ + public void parseSort(String sortString) throws LuceneException { + if (sortString == null || sortString.equals("") || sortString.equals("{}")) { + scored = true; + sort = new Sort(SortField.FIELD_SCORE, new SortedNumericSortField("id", Type.LONG)); + return; + } + try (JsonReader reader = Json.createReader(new ByteArrayInputStream(sortString.getBytes()))) { + JsonObject object = reader.readObject(); + List fields = new ArrayList<>(); + for (String key : object.keySet()) { + String order = object.getString(key); + boolean reverse; + if (order.equals("asc")) { + reverse = false; + } else if (order.equals("desc")) { + reverse = true; + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Sort order must be 'asc' or 'desc' but it was '" + order + "'"); + } + + if (DocumentMapping.longFields.contains(key)) { + fields.add(new SortedNumericSortField(key, Type.LONG, reverse)); + } else if (DocumentMapping.doubleFields.contains(key)) { + fields.add(new SortedNumericSortField(key, Type.DOUBLE, reverse)); + } else { + fields.add(new SortField(key, Type.STRING, reverse)); + } + } + fields.add(new SortedNumericSortField("id", Type.LONG)); + scored = false; + sort = new Sort(fields.toArray(new SortField[0])); + } + } +} diff --git a/src/main/resources/run.properties b/src/main/resources/run.properties index b010790..0e3c2ed 100644 --- a/src/main/resources/run.properties +++ b/src/main/resources/run.properties @@ -1,6 +1,16 @@ # Real comments in this file are marked with '#' whereas commented out lines # are marked with '!' 
-directory = ${HOME}/data/lucene -commitSeconds = 5 -ip = 127.0.0.1/32 +directory = ${HOME}/data/search +commitSeconds = 5 +maxShardSize = 2147483648 +ip = 127.0.0.1/32 +# A search taking longer than this will be cancelled to avoid blocking other users' searches +maxSearchTimeSeconds = 5 +# List of units to enable conversion to SI units when querying on numerical parameters +units = J: eV 1.602176634e-19; \u2103: celsius, degC; K: kelvin +# List of fields that should be stored for facet filtering when searching +# In order to be available, these fields must be set when indexing the data +facetFields = datafileFormat.name instrument.name sample.type.name stringValue technique.name type.name +# Aggregate file sizes and counts in real time (this will have a performance impact on write operations) +aggregateFiles = false diff --git a/src/main/resources/synonym.txt b/src/main/resources/synonym.txt new file mode 100755 index 0000000..5e633da --- /dev/null +++ b/src/main/resources/synonym.txt @@ -0,0 +1,246 @@ +# Synonyms to be applied after stemming according to the Porter algorithm + +# Alternate spellings +ionise, ionize + +# Elements +Hydrogen, H +Helium, He +Lithium, Li +Beryllium, Be +Boron, B +Carbon, C +Nitrogen, N +Oxygen, O +Fluorine, F +Neon, Ne +Sodium,Na +Magnesium,Mg +Aluminum,Al +Silicon,Si +Phosphorus,P +Sulfur, Sulphur,S +Chlorine,Cl +Argon,Ar +Potassium,K +Calcium,Ca +Scandium,Sc +Titanium,Ti +Vanadium,V +Chromium,Cr +Manganese,Mn +Iron,Fe +Cobalt,Co +Nickel,Ni +Copper,Cu +Zinc,Zn +Gallium,Ga +Germanium,Ge +Arsenic,As +Selenium,Se +Bromine,Br +Krypton,Kr +Rubidium,Rb +Strontium,Sr +Yttrium,Y +Zirconium,Zr +Niobium,Nb +Molybdenum,Mo +Technetium,Tc +Ruthenium,Ru +Rhodium,Rh +Palladium,Pd +Silver,Ag +Cadmium,Cd +Indium,In +Tin,Sn +Antimony,Sb +Tellurium,Te +Iodine,I +Xenon,Xe +Caesium, Cesium, Cs +Barium,Ba +Lanthanum,La +Cerium,Ce +Praseodymium,Pr +Neodymium,Nd +Promethium,Pm +Samarium,Sm +Europium,Eu +Gadolinium,Gd +Terbium,Tb +Dysprosium,Dy +Holmium,Ho +Erbium,Er +Thulium,Tm +Ytterbium,Yb +Lutetium,Lu +Hafnium,Hf +Tantalum,Ta +Tungsten, Wolfram,W +Rhenium,Re +Osmium,Os +Iridium,Ir +Platinum,Pt +Gold,Au +Mercury,Hg +Thallium,Tl +Lead,Pb +Bismuth,Bi +Polonium,Po +Astatine,At +Radon,Rn +Francium,Fr +Radium,Ra +Actinium,Ac +Thorium,Th +Protactinium,Pa +Uranium,U +Neptunium,Np +Plutonium,Pu +Americium,Am +Curium,Cm +Berkelium,Bk +Californium,Cf +Einsteinium,Es +Fermium,Fm +Mendelevium,Md +Nobelium,No +Lawrencium,Lr +Rutherfordium,Rf +Dubnium,Db +Seaborgium,Sg +Bohrium,Bh +Hassium,Hs +Meitnerium, Mt +Darmstadtium ,Ds +Roentgenium ,Rg +Copernicium ,Cn +Nihonium,Nh +Flerovium,Fl +Moscovium,Mc +Livermorium,Lv +Tennessine,Ts +Oganesson,Og + +# Techniques +forward scattering technique, propagation technique => forward scattering technique, propagation technique +TOF, time of flight technique => TOF, time of flight technique +femtosecond probe, ultrafast probe => femtosecond probe, ultrafast probe +MuSR, muon spin resonance => MuSR, muon spin resonance +crystallography, obtain crystal structure => crystallography, obtain crystal structure +time dependent study, time resolved study => time dependent study, time resolved study +ARPES, angle resolved photoemission spectroscopy => ARPES, angle resolved photoemission spectroscopy +GISAS, grazing incidence SAS, grazing incidence small angle scattering => GISAS, grazing incidence SAS, grazing incidence small angle scattering +NPD, neutron powder diffraction => NPD, neutron powder diffraction +XPD, x-ray powder diffraction => XPD, x-ray powder 
diffraction +SXRD, single crystal x-ray diffraction, x-ray single crystal diffraction => SXRD, single crystal x-ray diffraction, x-ray single crystal diffraction +HAXPES, hard photoelectron spectroscopy, hard x-ray photoelectron spectroscopy => HAXPES, hard photoelectron spectroscopy, hard x-ray photoelectron spectroscopy +inelastic SAS, inelastic small angle scatteringng => inelastic SAS, inelastic small angle scatteringng +IR spectroscopy, infrared spectroscopy => IR spectroscopy, infrared spectroscopy +Micro XRF, fluorescence microscopy, microfluorescence => Micro XRF, fluorescence microscopy, microfluorescence +PCS, photon correlation spectroscopy => PCS, photon correlation spectroscopy +quasi elastic spin echo, quasielastic neutron spin echo scattering, quasielastic spin echo => quasi elastic spin echo, quasielastic neutron spin echo scattering, quasielastic spin echo +reflectivity, reflectometry => reflectivity, reflectometry +anomalous diffraction, anomalous scattering, resonant diffraction => anomalous diffraction, anomalous scattering, resonant diffraction +STM, scanning transmission microscopy => STM, scanning transmission microscopy +SAS, small angle diffraction, small angle scattering => SAS, small angle diffraction, small angle scattering +spin echo SANS, spin echo small angle scattering => spin echo SANS, spin echo small angle scattering +UV circular dichroism, UVCD => UV circular dichroism, UVCD +USAS, ultra small angle scattering => USAS, ultra small angle scattering +diffraction imaging, topography => diffraction imaging, topography +XMCD, x-ray magnetic circular dichroism => XMCD, x-ray magnetic circular dichroism +LD, linear dichroism => LD, linear dichroism +XEOL, x-ray excited optical luminescence => XEOL, x-ray excited optical luminescence +MCD, magnetic circular dichroism => MCD, magnetic circular dichroism +MChD, magnetochiral dichroism => MChD, magnetochiral dichroism +NCD, natural circular dichroism => NCD, natural circular dichroism +EM, electron microscopy => EM, electron microscopy +PEEM, photoemission electron microscopy, photoemission microscopy => PEEM, photoemission electron microscopy, photoemission microscopy +scanning microscopy, scanning probe microscopy => scanning microscopy, scanning probe microscopy +XRR, x-ray reflectivity, x-ray reflectometry => XRR, x-ray reflectivity, x-ray reflectometry +EDD, energy dispersive diffraction => EDD, energy dispersive diffraction +EDXRD, energy dispersive x-ray diffraction => EDXRD, energy dispersive x-ray diffraction +GIXD, grazing incidence x-ray diffraction => GIXD, grazing incidence x-ray diffraction +GISAXS, grazing incidence small angle x-ray scattering => GISAXS, grazing incidence small angle x-ray scattering +Diffraction, high pressure single crystal diffraction => Diffraction, high pressure single crystal diffraction +MX, macromolecular crystallography, protein crystallography => MX, macromolecular crystallography, protein crystallography +MAD, multi wavelength anomalous diffraction, multi wavelength anomalous dispersion => MAD, multi wavelength anomalous diffraction, multi wavelength anomalous dispersion +PhD, photoelectron diffraction => PhD, photoelectron diffraction +SFX, serial femtosecond crystallography => SFX, serial femtosecond crystallography +SSX, serial synchrotron crystallography => SSX, serial synchrotron crystallography +SAD, single wavelength anomalous diffraction, single wavelength anomalous dispersion => SAD, single wavelength anomalous diffraction, single wavelength anomalous dispersion 
+chemical crystallography, small molecule crystallography, small molecule diffraction => chemical crystallography, small molecule crystallography, small molecule diffraction +XSW, x-ray standing wave => XSW, x-ray standing wave +CDI, coherent diffraction imaging, coherent diffractive imaging => CDI, coherent diffraction imaging, coherent diffractive imaging +infrared nanospectroscopy imaging, nano infrared spectroscopy => infrared nanospectroscopy imaging, nano infrared spectroscopy +XRF, x-ray fluorescence => XRF, x-ray fluorescence +IR microscopy, infrared microscopy => IR microscopy, infrared microscopy +PDF, pair distribution function => PDF, pair distribution function +IXS, inelastic x-ray scattering => IXS, inelastic x-ray scattering +RIXS, resonant inelastic x-ray scattering => RIXS, resonant inelastic x-ray scattering +RXS, resonant x-ray scattering => RXS, resonant x-ray scattering +RSXS, resonant soft x-ray scattering => RSXS, resonant soft x-ray scattering +SAXS, small angle x-ray scattering => SAXS, small angle x-ray scattering +SANS, small angle neutron scattering => SANS, small angle neutron scattering +WAXS, wide angle x-ray scattering => WAXS, wide angle x-ray scattering +CD, circular dichroism => CD, circular dichroism +EDX, energy dispersive x-ray spectroscopy => EDX, energy dispersive x-ray spectroscopy +XAS, x-ray absorption spectroscopy => XAS, x-ray absorption spectroscopy +XAFS, x-ray absorption fine structure => XAFS, x-ray absorption fine structure +EXAFS, extended x-ray absorption fine structure => EXAFS, extended x-ray absorption fine structure +NEXAFS, XANES, x-ray absorption near edge structure => NEXAFS, XANES, x-ray absorption near edge structure +XES, x-ray emission spectroscopy => XES, x-ray emission spectroscopy +PES, photoelectron spectroscopy => PES, photoelectron spectroscopy +XPS, x-ray photoelectron spectroscopy => XPS, x-ray photoelectron spectroscopy +XPCS, x-ray photon correlation spectroscopy => XPCS, x-ray photon correlation spectroscopy +CT scan, x-ray tomography => CT scan, x-ray tomography +Absorption-based tomographic microscopy, absorption microtomography => Absorption-based tomographic microscopy, absorption microtomography +Ultra-fast tomographic microscopy, ultrafast microtomography => Ultra-fast tomographic microscopy, ultrafast microtomography +XRD, x-ray diffraction => XRD, x-ray diffraction +STXM, scanning transmission x-ray microscopy => STXM, scanning transmission x-ray microscopy +TEY, total electron yield => TEY, total electron yield +XMCD TEY, XMCD total electron yield => XMCD TEY, XMCD total electron yield +neutron reflectivity, neutron reflectometry => neutron reflectivity, neutron reflectometry +USAXS, ultra small angle x-ray scattering => USAXS, ultra small angle x-ray scattering +polarized neutron reflectivity, polarized neutron reflectometry => polarized neutron reflectivity, polarized neutron reflectometry +TOF spectrometry, TOF spectroscopy, time-of-flight spectrometry => TOF spectrometry, TOF spectroscopy, time-of-flight spectrometry +inelastic neutron scattering, inelastic neutron scattering spectroscopy, inelastic neutron spectroscopy => inelastic neutron scattering, inelastic neutron scattering spectroscopy, inelastic neutron spectroscopy +XMLD, x-ray magnetic linear dichroism => XMLD, x-ray magnetic linear dichroism +REXS, resonant elastic x-ray scattering => REXS, resonant elastic x-ray scattering +x-ray refraction imaging, x-ray refraction radiography => x-ray refraction imaging, x-ray refraction radiography +time 
dependent scattering, time resolved scattering => time dependent scattering, time resolved scattering +time dependent diffraction, time resolved diffraction => time dependent diffraction, time resolved diffraction +time dependent absorption, time resolved absorption => time dependent absorption, time resolved absorption +ASAXS, anomalous small angle x-ray scattering => ASAXS, anomalous small angle x-ray scattering +ASAX, anomalous solution x-ray scattering => ASAX, anomalous solution x-ray scattering +GISANS, grazing incidence small angle neutron scattering => GISANS, grazing incidence small angle neutron scattering +VSANS, very small angle neutron scattering => VSANS, very small angle neutron scattering +micro SAXS tomography, micro small angle x-ray scattering tomography => micro SAXS tomography, micro small angle x-ray scattering tomography +micro GISAXS tomography, micro grazing incidence small angle x-ray scattering tomography => micro GISAXS tomography, micro grazing incidence small angle x-ray scattering tomography +nano ARPES, nano angle resolved photoemission spectroscopy => nano ARPES, nano angle resolved photoemission spectroscopy +scanning x-ray microscopy, x-ray scanning microscopy => scanning x-ray microscopy, x-ray scanning microscopy +HR-XPS, high resolution x-ray photoelectron spectroscopy => HR-XPS, high resolution x-ray photoelectron spectroscopy +RENS, elastic neutron scattering spectroscopy, resolution elastic neutron scattering => RENS, elastic neutron scattering spectroscopy, resolution elastic neutron scattering +XMChiD, x-ray magnetochiral dichroism => XMChiD, x-ray magnetochiral dichroism +XNCD, x-ray natural circular dichroism => XNCD, x-ray natural circular dichroism +XNLD, x-ray natural linear dichroism => XNLD, x-ray natural linear dichroism +crystallographic fragment screening, fragment screening => crystallographic fragment screening, fragment screening +microfocus MX, microfocus macromolecular crystallography => microfocus MX, microfocus macromolecular crystallography +nanofocus MX, nanofocus macromolecular crystallography => nanofocus MX, nanofocus macromolecular crystallography +MR, molecular replacement => MR, molecular replacement +TR-SFX, time resolved serial femtosecond crystallography => TR-SFX, time resolved serial femtosecond crystallography +FT-SSX, fixed target serial synchrotron crystallography => FT-SSX, fixed target serial synchrotron crystallography +LCP-SSX, lipidic cubic phase serial synchrotron crystallography => LCP-SSX, lipidic cubic phase serial synchrotron crystallography +TR-SSX, time resolved serial synchrotron crystallography => TR-SSX, time resolved serial synchrotron crystallography +CLXM, correlative light x-ray microscopy => CLXM, correlative light x-ray microscopy +GIWAXS, grazing incidence wide angle scattering => GIWAXS, grazing incidence wide angle scattering +HR-ARPES, high resolution angle resolved photoemission spectroscopy => HR-ARPES, high resolution angle resolved photoemission spectroscopy +AFM, atomic force microscopy => AFM, atomic force microscopy +AFM-IR, atomic force microscope infrared spectroscopy => AFM-IR, atomic force microscope infrared spectroscopy +FTIR, fourier transform infrared spectroscopy => FTIR, fourier transform infrared spectroscopy +ED-EXAFS, EDE, energy dispersive extended x-ray absorption fine structure => ED-EXAFS, EDE, energy dispersive extended x-ray absorption fine structure +radiation therapy, radiotherapy => radiation therapy, radiotherapy +obtain surface atomic structure, surface 
crystallography => obtain surface atomic structure, surface crystallography +XBI, x-ray birefringence imaging => XBI, x-ray birefringence imaging + diff --git a/src/main/scripts/parse_synonyms.py b/src/main/scripts/parse_synonyms.py new file mode 100644 index 0000000..d23d2a4 --- /dev/null +++ b/src/main/scripts/parse_synonyms.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 + +import csv +import sys +from typing import Dict, List + + +def add_to_parents( + relationships: Dict[str, Dict[str, List[str]]], + label: str, + parents: List[str], + child_depth: int +): + """ + Adds the `label` to all the entries in `relationships` that have a key in + `parents`, then recursively calls itself to add `label` to any + grandparents. `child_depth` is decreased by 1 for each generation to + prevent exponentially large injections. + + Parameters + ---------- + relationships: Dict[str, Dict[str, List[str]]] + Maps terms to an inner dictionary containing arrays for "alternatives", + "parents", and "children". + label: str + The term to be added to its `parents`. + parents: List[str] + The direct parents of the current `label` + child_depth: int + The number of generations of children to inject for each term. + For example, a value of 2 would inject children and their children. + 0 will only add alternative terms. Negative integers will add all + children, grandchildren, etc. Note that this may result in an + exponentially large number of terms + """ + if child_depth != 0: + for parent in parents: + try: + relationships[parent]["children"].append(label) + # If the parent is equivalent to anything, also add label as a + # child of the equivalent_parent + for equivalent_parent in relationships[parent]["equivalent"]: + relationships[equivalent_parent]["children"].append(label) + add_to_parents( + relationships, + label, + relationships[parent]["parents"], + child_depth - 1, + ) + except KeyError: + pass + + +def main(input_file: str, output_file: str, mode: str, max_child_depth: int): + """ + Reads an CSV file of terminology and writes it into Solr synonym format + for use in synonym injection. Alternative terms are always written, and the + number of child terms is configurable by `max_child_depth`. + + Parameters + ---------- + input_file: str + CSV file to read ontology from. + output_file: str + Solr synonym output file. + mode: str + Python file mode (w, a, ...) to use when writing the output file. + max_child_depth: int + The maximum number of generations of children to inject for each term. + For example, a value of 2 would inject children and their children. + 0 will only add alternative terms. Negative integers will add all + children, grandchildren, etc. 
Note that this may result in an + exponentially large number of terms + """ + alt_indices = [] + parent_indices = [] + equivalent_indices = [] + equivalent_pairs = {} + relationships = {} + with open(input_file) as f: + reader = csv.reader(f) + + # Dynamically determine header positions + headers = next(reader) + for i, header in enumerate(headers): + if "Label" == header.strip(): + label_index = i + elif "Alt Label" in header.strip(): + alt_indices.append(i) + elif "Parent IRI" == header.strip(): + parent_indices.append(i) + elif "Equivalent" == header.strip(): + equivalent_indices.append(i) + + for entries in reader: + try: + int(entries[0]) + except (ValueError, IndexError): + # If we do not have an ID, continue to the next line + continue + + label = entries[label_index] + if label in relationships.keys(): + raise ValueError(f"Duplicate entry for label {label}") + + relationships[label] = { + "alternatives": [], + "parents": [], + "equivalent": [], + "children": [], + } + for alt_index in alt_indices: + alternative_label = entries[alt_index] + if alternative_label: + relationships[label]["alternatives"].append( + alternative_label + ) + for parent_index in parent_indices: + parent = entries[parent_index] + if parent: + relationships[label]["parents"].append(parent) + for equivalent_index in equivalent_indices: + equivalent_label = entries[equivalent_index] + if equivalent_label: + relationships[label]["equivalent"].append(equivalent_label) + equivalent_pairs[equivalent_label] = label + + # If A is equivalent to B, then also set B equivalent to A + # This ensures they share all children + for key, value in equivalent_pairs.items(): + try: + relationships[key]["equivalent"].append(value) + except KeyError: + pass + + print(f"{len(relationships)} relationships found") + for label, relationship in relationships.items(): + add_to_parents( + relationships, label, relationship["parents"], max_child_depth + ) + + output = "" + for label, relationship in relationships.items(): + # Only write to file if we have alternative or child terms + if (len(relationship["alternatives"]) > 0 + or len(relationship["children"]) > 0): + left_hand_side = ", ".join( + sorted(set([label] + relationship["alternatives"])) + ) + right_hand_side = ", ".join( + sorted(set( + [label] + + relationship["alternatives"] + + relationship["children"] + )) + ) + output += left_hand_side + " => " + right_hand_side + "\n" + + with open(output_file, mode) as f: + f.write(output) + + +if __name__ == "__main__": + args = sys.argv + try: + input_file = args[1] + except IndexError as e: + raise IndexError("input_file to parse not provided") from e + try: + output_file = args[2] + except IndexError as e: + raise IndexError("output_file to write to not provided") from e + try: + mode = args[3] + except IndexError: + # Default to appending to the output_file (no overwrite) + mode = "a" + try: + max_child_depth = int(args[4]) + except (IndexError, ValueError): + # Default to 0 depth (only alternative terms) + max_child_depth = 0 + + main(input_file, output_file, mode, max_child_depth) diff --git a/src/site/xhtml/installation.xhtml.vm b/src/site/xhtml/installation.xhtml.vm index 37ec5ef..8d4f801 100644 --- a/src/site/xhtml/installation.xhtml.vm +++ b/src/site/xhtml/installation.xhtml.vm @@ -56,6 +56,11 @@
the interval in seconds between committing lucene changes to disk and updating the index.
+<dt>maxShardSize</dt>
+<dd>The maximum number of documents to store in a single index before "sharding"
+into an additional index. All sharded indices are searched at once when
+performing a search. Has a maximum value of 2147483648 (max int + 1).</dd>
+
ip
Ranges of ip addresses to accept requests from. This should be as
restrictive as possible - just list the icats you need to
@@ -63,6 +68,29 @@
take the form of an IPV4 or IPV6 address followed by the number of bits
(starting from the most significant) to consider. For example 127.0.0.1/32
is the IPV4 value for localhost.
+ +
units
+
Recognised unit names/symbols. Each symbol recognised by indriya's + SimpleUnitFormat should be followed by a colon, and then a comma separated + list of units measuring the same property. If the unit is simply an alias + (e.g. "K: kelvin") this is sufficient. If a conversion is required, it + should be followed by this factor (e.g. "J: eV 1.602176634e-19"). Different + units can be separated by a semi-colon.
+ +
facetFields
+
The names of fields which should be stored as facetable. The names should + correspond to how the field appears in the Lucene index, which may be + different to how it is represented in the ICAT database due to flattening of + one to one relationships between entities. Accurate field names can be taken + from `getDoc` function(s) in icat.server. Note that in order to be available + at search time, the field must have been specified when indexing the + documents.
+ +
aggregateFiles
+
Aggregate file sizes/counts for Datasets and Investigations as Datafiles are + added or modified (i.e. in real time). This can have a significant + performance impact when writing to the index. If "false", icat.server can + instead be configured to update sizes at regular intervals.
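For illustration of the maxShardSize behaviour, here is a minimal sketch of how several shard indices could be searched as one. The shard directory naming and the helper below are hypothetical, not the server's actual implementation; only the use of Lucene's MultiReader to present multiple readers as one index is the point being shown.

    import java.nio.file.Paths;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.MultiReader;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.store.FSDirectory;

    public class ShardedSearchSketch {
        /** Opens every shard of an entity index and exposes them through a single searcher. */
        public static IndexSearcher openShardedSearcher(String baseDir, String entity, int shards) throws Exception {
            List<IndexReader> readers = new ArrayList<>();
            for (int i = 0; i < shards; i++) {
                // Hypothetical naming: shard 0 is e.g. "Dataset", later shards "Dataset_1", "Dataset_2", ...
                String name = (i == 0) ? entity : entity + "_" + i;
                readers.add(DirectoryReader.open(FSDirectory.open(Paths.get(baseDir, name))));
            }
            // MultiReader lets one IndexSearcher query all shards at once
            return new IndexSearcher(new MultiReader(readers.toArray(new IndexReader[0])));
        }
    }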
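To show why facetFields must also be set at indexing time, a minimal indexing sketch follows. The field names and the addDataset helper are only examples; the pattern mirrors the SortedSetDocValuesFacetField usage in the test changes later in this patch.

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.facet.FacetsConfig;
    import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
    import org.apache.lucene.index.IndexWriter;

    public class FacetIndexingSketch {
        private static final FacetsConfig facetsConfig = new FacetsConfig();

        static void addDataset(IndexWriter writer, String name, String typeName) throws Exception {
            Document doc = new Document();
            doc.add(new StringField("name", name, Store.YES));
            // Plain field for ordinary querying
            doc.add(new StringField("type.name", typeName, Store.YES));
            // Facet dimension: without this line the field cannot be used for facet filtering at search time
            doc.add(new SortedSetDocValuesFacetField("type.name", typeName));
            // FacetsConfig.build translates the facet field into its doc values representation
            writer.addDocument(facetsConfig.build(doc));
        }
    }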
diff --git a/src/site/xhtml/release-notes.xhtml b/src/site/xhtml/release-notes.xhtml index 1b43c6b..145a2f9 100644 --- a/src/site/xhtml/release-notes.xhtml +++ b/src/site/xhtml/release-notes.xhtml @@ -6,6 +6,18 @@

ICAT Lucene Server Release Notes

+

3.0.0

+

Significant changes to the functionality and performance of searches:

+
    +
  • Ability to search on over 2 billion documents
  • +
  • Enable sorting on specific entity fields
  • +
  • "Infinitely" page through the data by using the searchAfter parameter (see the sketch after this list)
  • +
  • Faceted searches
  • +
  • Replace the single "text" field with specific fields that reflect the ICAT schema to allow field targeting
  • +
  • Support for unit conversion on numeric Parameters
  • +
  • Support for synonym injection
  • +
+
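As an illustration of the searchAfter style of pagination mentioned above, here is a minimal sketch using plain Lucene APIs; the query, field name and page size are arbitrary, and this is not the server's actual request handling.

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.TopDocs;

    public class SearchAfterSketch {
        /** Walks an arbitrarily large result set one page at a time. */
        static long countAll(IndexSearcher searcher) throws Exception {
            Query query = new TermQuery(new Term("type.name", "calibration"));
            ScoreDoc last = null; // acts as the searchAfter marker carried between pages
            long total = 0;
            while (true) {
                TopDocs page = (last == null)
                        ? searcher.search(query, 100)
                        : searcher.searchAfter(last, query, 100);
                if (page.scoreDocs.length == 0) {
                    break;
                }
                total += page.scoreDocs.length;
                last = page.scoreDocs[page.scoreDocs.length - 1];
            }
            return total;
        }
    }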

2.0.2

Fix compatibility with indexes built by icat.lucene 1.x

diff --git a/src/test/java/icat/lucene/TestLucene.java b/src/test/java/icat/lucene/TestLucene.java old mode 100644 new mode 100755 index f5cd493..cedd5b0 --- a/src/test/java/icat/lucene/TestLucene.java +++ b/src/test/java/icat/lucene/TestLucene.java @@ -9,13 +9,23 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; +import java.util.HashMap; import java.util.List; +import java.util.Map; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; +import org.apache.lucene.facet.FacetResult; +import org.apache.lucene.facet.Facets; +import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.facet.FacetsConfig; +import org.apache.lucene.facet.LabelAndValue; +import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState; +import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts; +import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; @@ -39,12 +49,15 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; import org.icatproject.lucene.IcatAnalyzer; +import org.icatproject.lucene.IcatSynonymAnalyzer; import org.junit.Test; public class TestLucene { static final int scale = (int) 1.0e5; + private final FacetsConfig facetsConfig = new FacetsConfig(); + @Test public void testIcatAnalyzer() throws Exception { final String text = "This is a demo of the 1st (or is it number 2) all singing and dancing TokenStream's API with added aardvarks"; @@ -66,8 +79,66 @@ public void testIcatAnalyzer() throws Exception { } } - assertEquals(11, n); - assertEquals(" demo 1st number 2 all sing danc tokenstream api ad aardvark", newString); + assertEquals(12, n); + assertEquals(" demo of 1st number 2 all sing danc tokenstream api ad aardvark", newString); + } + + /** + * Test that IcatSynonymAnalyzer injects stems for alternate spellings and + * chemical symbols for the elements + */ + @Test + public void testIcatSynonymAnalyzer() throws Exception { + final String text = "hydrogen Helium LITHIUM be B NE ionisation TIME of FLIGHT technique ArPeS"; + int n = 0; + String newString = ""; + + try (Analyzer analyzer = new IcatSynonymAnalyzer()) { + TokenStream stream = analyzer.tokenStream("field", new StringReader(text)); + CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); + try { + stream.reset(); // Curiously this is required + while (stream.incrementToken()) { + n++; + newString = newString + " " + termAtt; + } + stream.end(); + } finally { + stream.close(); + } + } + + assertEquals(24, n); + assertEquals(" h hydrogen he helium li lithium beryllium be boron b neon ne ioniz ionis tof time of flight techniqu arp angl resolv photoemiss spectroscopi", newString); + } + + /** + * Test that we do not stop words that are chemical symbols (As At Be In No) + * but otherwise filter out stop words + */ + @Test + public void testIcatAnalyzerStopWords() throws Exception { + final String text = "as at be in no that the their then there"; + int n = 0; + String newString = ""; + + try (Analyzer analyzer = new IcatAnalyzer()) { + TokenStream stream = analyzer.tokenStream("field", new StringReader(text)); + CharTermAttribute termAtt = 
stream.addAttribute(CharTermAttribute.class); + try { + stream.reset(); // Curiously this is required + while (stream.incrementToken()) { + n++; + newString = newString + " " + termAtt; + } + stream.end(); + } finally { + stream.close(); + } + } + + assertEquals(5, n); + assertEquals(" as at be in no", newString); } @Test @@ -171,6 +242,57 @@ public void testJoins() throws Exception { System.out.println("Join tests took " + (System.currentTimeMillis() - start) + "ms"); } + @Test + public void testFacets() throws Exception { + Analyzer analyzer = new IcatAnalyzer(); + IndexWriterConfig config; + + Path tmpLuceneDir = Files.createTempDirectory("lucene"); + FSDirectory investigationDirectory = FSDirectory.open(tmpLuceneDir.resolve("Investigation")); + config = new IndexWriterConfig(analyzer); + config.setOpenMode(OpenMode.CREATE); + IndexWriter investigationWriter = new IndexWriter(investigationDirectory, config); + + // Add investigations with parameter and sample Facets + addFacetedInvestigation(investigationWriter, "inv1", 101, "parameter1", "sample1"); + addFacetedInvestigation(investigationWriter, "inv2", 102, "parameter2", "sample2"); + + // Add investigations with only the parameter Facet + for (int i = 0; i < scale; i++) { + addFacetedInvestigation(investigationWriter, "extra" + i, 500 + i, "parameter0"); + } + + investigationWriter.close(); + + DirectoryReader directoryReader = DirectoryReader.open(investigationDirectory); + IndexSearcher investigationSearcher = new IndexSearcher(directoryReader); + StandardQueryParser parser = new StandardQueryParser(); + StandardQueryConfigHandler qpConf = (StandardQueryConfigHandler) parser.getQueryConfigHandler(); + qpConf.set(ConfigurationKeys.ANALYZER, analyzer); + qpConf.set(ConfigurationKeys.ALLOW_LEADING_WILDCARD, true); + Map labelValuesParameter = new HashMap<>(); + Map labelValuesSample = new HashMap<>(); + + long start = System.currentTimeMillis(); + + // Get Facets that are relevant for "inv1" + labelValuesParameter.put("parameter1", 1); + labelValuesSample.put("sample1", 1); + checkFacets(labelValuesParameter, labelValuesSample, "inv1", investigationSearcher, directoryReader, parser); + + // Get Facets that are relevant for "inv*" + labelValuesParameter.put("parameter2", 1); + labelValuesSample.put("sample2", 1); + checkFacets(labelValuesParameter, labelValuesSample, "inv*", investigationSearcher, directoryReader, parser); + + // Get all Facets for "*" + labelValuesParameter.put("parameter0", scale); + checkFacets(labelValuesParameter, labelValuesSample, "*", investigationSearcher, directoryReader, parser); + + System.out.println("Facet tests took " + (System.currentTimeMillis() - start) + "ms"); + } + + private void checkDatafiles(List dnums, String fname, String uname, IndexSearcher investigationSearcher, IndexSearcher investigationUserSearcher, IndexSearcher datasetSearcher, IndexSearcher datafileSearcher, StandardQueryParser parser) throws IOException, QueryNodeException { @@ -253,6 +375,20 @@ private ScoreDoc[] get(String iname, String uname, IndexSearcher investigationSe } + /* Facets */ + private Facets get(String iname, IndexSearcher investigationSearcher, DirectoryReader directoryReader, + StandardQueryParser parser) throws QueryNodeException, IOException { + BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); + if (iname != null) { + theQuery.add(parser.parse(iname, "name"), Occur.MUST); + } + DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(directoryReader); + FacetsCollector 
facetsCollector = new FacetsCollector(); + FacetsCollector.search(investigationSearcher, theQuery.build(), 50, facetsCollector); + Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); + return facets; + } + private void checkDatasets(List dnums, String sname, String uname, IndexSearcher investigationSearcher, IndexSearcher investigationUserSearcher, IndexSearcher datasetSearcher, StandardQueryParser parser) throws IOException, QueryNodeException { @@ -265,6 +401,31 @@ private void checkDatasets(List dnums, String sname, String uname, Inde } + private void checkFacets(Map labelValuesParameter, Map labelValuesSample, + String iname, IndexSearcher investigationSearcher, DirectoryReader directoryReader, + StandardQueryParser parser) throws QueryNodeException, IOException { + Facets facets = get(iname, investigationSearcher, directoryReader, parser); + List results = facets.getAllDims(50); + if (labelValuesParameter.size() > 0) { + FacetResult parameterResult = results.remove(0); + assertEquals("Dimension", "parameter", parameterResult.dim); + assertEquals("Length", labelValuesParameter.size(), parameterResult.labelValues.length); + for (LabelAndValue labelValue : parameterResult.labelValues) { + assertTrue("Label", labelValuesParameter.containsKey(labelValue.label)); + assertEquals("Value", labelValuesParameter.get(labelValue.label), labelValue.value); + } + } + if (labelValuesSample.size() > 0) { + FacetResult sampleResult = results.remove(0); + assertEquals("Dimension", "sample", sampleResult.dim); + assertEquals("Length", labelValuesSample.size(), sampleResult.labelValues.length); + for (LabelAndValue labelValue : sampleResult.labelValues) { + assertTrue("Label", labelValuesSample.containsKey(labelValue.label)); + assertEquals("Value", labelValuesSample.get(labelValue.label), labelValue.value); + } + } + } + private void checkInvestigations(List dnums, String iname, String uname, IndexSearcher investigationSearcher, IndexSearcher investigationUserSearcher, StandardQueryParser parser) throws QueryNodeException, IOException { @@ -285,6 +446,27 @@ private void addInvestigation(IndexWriter iwriter, String name, long iNum) throw iwriter.addDocument(doc); } + private void addFacetedInvestigation(IndexWriter iwriter, String name, long iNum, String parameterValue, + String sampleValue) throws IOException { + Document doc = new Document(); + doc.add(new StringField("name", name, Store.NO)); + doc.add(new SortedDocValuesField("id", new BytesRef(Long.toString(iNum)))); + doc.add(new StringField("id", Long.toString(iNum), Store.YES)); + doc.add(new SortedSetDocValuesFacetField("parameter", parameterValue)); + doc.add(new SortedSetDocValuesFacetField("sample", sampleValue)); + iwriter.addDocument(facetsConfig.build(doc)); + } + + private void addFacetedInvestigation(IndexWriter iwriter, String name, long iNum, String parameterValue) + throws IOException { + Document doc = new Document(); + doc.add(new StringField("name", name, Store.NO)); + doc.add(new SortedDocValuesField("id", new BytesRef(Long.toString(iNum)))); + doc.add(new StringField("id", Long.toString(iNum), Store.YES)); + doc.add(new SortedSetDocValuesFacetField("parameter", parameterValue)); + iwriter.addDocument(facetsConfig.build(doc)); + } + private void addInvestigationUser(IndexWriter iwriter, String name, long iNum) throws IOException { Document doc = new Document(); doc.add(new StringField("name", name, Store.NO));