Skip to content

Commit

Permalink
Merge pull request #167 from aodn/manual-org-vocabs-mapping
Browse files Browse the repository at this point in the history
Manual org vocabs mapping
  • Loading branch information
utas-raymondng authored Nov 28, 2024
2 parents 48889c1 + 00d4559 commit 15482dd
Show file tree
Hide file tree
Showing 38 changed files with 404 additions and 304 deletions.
5 changes: 5 additions & 0 deletions indexer/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,11 @@
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
<version>1.10.0</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
import org.springframework.web.client.RestTemplate;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.atomic.AtomicReference;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package au.org.aodn.esindexer.service;

import au.org.aodn.ardcvocabs.model.VocabModel;
import au.org.aodn.esindexer.configuration.AppConstants;
import au.org.aodn.esindexer.exception.*;
import au.org.aodn.esindexer.model.DatasetProvider;
Expand Down Expand Up @@ -47,6 +48,8 @@
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static au.org.aodn.esindexer.utils.CommonUtils.safeGet;

@Slf4j
@Service
@Scope(proxyMode = ScopedProxyMode.TARGET_CLASS)
Expand Down Expand Up @@ -184,12 +187,18 @@ protected StacCollectionModel getMappedMetadataValues(String metadataValues) thr
stacCollectionModel.getSummaries().setScore(score);

// parameter vocabs
List<String> mappedParameterVocabsFromGcmdKeywords = gcmdKeywordUtils.getMappedParameterVocabsFromGcmdKeywords(stacCollectionModel.getThemes());
List<String> processedParameterVocabs = vocabService.extractVocabLabelsFromThemes(stacCollectionModel.getThemes(), AppConstants.AODN_DISCOVERY_PARAMETER_VOCABS);
Set<String> mappedParameterLabels = new HashSet<>();
List<String> processedParameterVocabs = vocabService.extractVocabLabelsFromThemes(
stacCollectionModel.getThemes(), AppConstants.AODN_DISCOVERY_PARAMETER_VOCABS
);

if (!processedParameterVocabs.isEmpty()) {
stacCollectionModel.getSummaries().setParameterVocabs(Stream.concat(mappedParameterVocabsFromGcmdKeywords.stream(), processedParameterVocabs.stream()).distinct().collect(Collectors.toList()));
mappedParameterLabels.addAll(processedParameterVocabs);
} else {
// manual mapping with custom logic when the record doesn't have existing AODN Parameter Vocabs
mappedParameterLabels.addAll(gcmdKeywordUtils.getMappedParameterVocabsFromGcmdKeywords(stacCollectionModel.getThemes()));
}
stacCollectionModel.getSummaries().setParameterVocabs(new ArrayList<>(mappedParameterLabels));

/*
NOTE: The following implementation for platform and organization vocabularies is just a placeholder, not the final version.
Expand All @@ -204,7 +213,18 @@ protected StacCollectionModel getMappedMetadataValues(String metadataValues) thr
}

// organisation vocabs
// TODO: the logics for mapping record's organisation vocabs are heavily customised for a manual approach, AI now or later? need dedicated service's method
Set<String> mappedOrganisationLabels = new HashSet<>();
List<String> organisationLabelsFromThemes = vocabService.extractOrganisationVocabLabelsFromThemes(stacCollectionModel.getThemes());
if (!organisationLabelsFromThemes.isEmpty()) {
mappedOrganisationLabels.addAll(organisationLabelsFromThemes);
} else {
// manual mapping with custom logics when the record doesn't have existing AODN Organisation Vocabs
List<VocabModel> mappedOrganisationVocabsFromContacts = vocabService.getMappedOrganisationVocabsFromContacts(stacCollectionModel.getContacts());
for (VocabModel vocabModel : mappedOrganisationVocabsFromContacts) {
mappedOrganisationLabels.addAll(extractOrderedLabels(vocabModel));
}
}
stacCollectionModel.getSummaries().setOrganisationVocabs(new ArrayList<>(mappedOrganisationLabels));

// search_as_you_type enabled fields can be extended
SearchSuggestionsModel searchSuggestionsModel = SearchSuggestionsModel.builder()
Expand All @@ -216,6 +236,19 @@ protected StacCollectionModel getMappedMetadataValues(String metadataValues) thr

return stacCollectionModel;
}

private List<String> extractOrderedLabels(VocabModel vocabModel) {
// Priority: DisplayLabel > AltLabels > PrefLabel
if (safeGet(vocabModel::getDisplayLabel).isPresent()) {
return List.of(vocabModel.getDisplayLabel());
} else if (safeGet(vocabModel::getAltLabels).isPresent()) {
return vocabModel.getAltLabels();
} else if (safeGet(vocabModel::getLabel).isPresent()) {
return List.of(vocabModel.getLabel());
}
return List.of();
}

/**
* Use to index a particular UUID, the async is used to limit the number of same function call to avoid flooding
* the system.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import java.time.*;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
package au.org.aodn.esindexer.service;

import au.org.aodn.ardcvocabs.model.VocabModel;
import au.org.aodn.stac.model.ContactsModel;
import au.org.aodn.stac.model.ThemesModel;
import com.fasterxml.jackson.databind.JsonNode;

import java.io.IOException;
import java.util.List;
import java.util.concurrent.ExecutionException;

public interface VocabService {
List<String> extractVocabLabelsFromThemes(List<ThemesModel> themes, String vocabType) throws IOException;

List<String> extractOrganisationVocabLabelsFromThemes(List<ThemesModel> themes) throws IOException;
List<VocabModel> getMappedOrganisationVocabsFromContacts(List<ContactsModel> contacts) throws IOException;
void populateVocabsData() throws IOException;
void populateVocabsDataAsync();
void clearParameterVocabCache();
void clearPlatformVocabCache();
void clearOrganisationVocabCache();

List<JsonNode> getParameterVocabs() throws IOException;
List<JsonNode> getPlatformVocabs() throws IOException;
List<JsonNode> getOrganisationVocabs() throws IOException;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import au.org.aodn.esindexer.configuration.AppConstants;
import au.org.aodn.esindexer.exception.DocumentNotFoundException;
import au.org.aodn.stac.model.ConceptModel;
import au.org.aodn.stac.model.ContactsModel;
import au.org.aodn.stac.model.ThemesModel;
import co.elastic.clients.elasticsearch.ElasticsearchClient;
import co.elastic.clients.elasticsearch._types.ElasticsearchException;
Expand All @@ -33,6 +34,8 @@
import java.io.IOException;
import java.util.*;

import static au.org.aodn.esindexer.utils.CommonUtils.safeGet;

@Slf4j
@Service
// create and inject a stub proxy to self due to the circular reference http://bit.ly/4aFvYtt
Expand Down Expand Up @@ -139,6 +142,105 @@ public List<String> extractVocabLabelsFromThemes(List<ThemesModel> themes, Strin
return results;
}

public List<String> extractOrganisationVocabLabelsFromThemes(List<ThemesModel> themes) {
List<String> results = new ArrayList<>();
themes.stream()
.filter(Objects::nonNull)
.forEach(theme -> safeGet(theme::getTitle)
.filter(title -> title.toLowerCase().contains("aodn organisation vocabulary"))
.ifPresent(title -> theme.getConcepts().stream()
.filter(concept -> concept.getId() != null && !concept.getId().isEmpty())
.forEach(concept -> results.add(concept.getId()))
));
return results;
}

public List<VocabModel> getMappedOrganisationVocabsFromContacts(List<ContactsModel> contacts) throws IOException {
List<String> contactOrgs = new ArrayList<>();
String citationRole = "citation";
String pointOfContactRole = "pointOfContact";

// Top priority to citation: cit:citedResponsibleParty>
contacts.stream()
.filter(contact -> safeGet(contact::getRoles)
.filter(roles -> roles.contains(citationRole))
.isPresent())
.map(ContactsModel::getOrganization)
.filter(Objects::nonNull)
.forEach(contactOrgs::add);

// Second priority if contactOrgs is still empty
if (contactOrgs.isEmpty()) {
contacts.stream()
.filter(contact -> safeGet(contact::getRoles)
.filter(roles -> roles.contains(pointOfContactRole))
.isPresent())
.map(ContactsModel::getOrganization)
.filter(Objects::nonNull)
.forEach(contactOrgs::add);
}

List<VocabModel> results = new ArrayList<>();
for (JsonNode orgVocab : self.getOrganisationVocabs()) {
if (orgVocab != null) {
try {
VocabModel vocabModel = indexerObjectMapper.treeToValue(orgVocab, VocabModel.class);
dfsSearch(vocabModel, contactOrgs, results);
} catch (JsonProcessingException e) {
log.error("Error deserializing JsonNode to VocabModel", e);
}
}
}

return results;
}

/**
* Performs a Depth-First Search (DFS) to find vocab matches.
* DFS is well-suited for hierarchical structures due to its memory efficiency
* and ability to capture matches at any depth while allowing early exits within branches.
*
* @param currentVocab the current vocab node being processed
* @param contactOrgs the list of organisation names to match against
* @param results the list to store matching vocab nodes
*/
private void dfsSearch(VocabModel currentVocab, List<String> contactOrgs, List<VocabModel> results) {
// Skip vocabs that have replaced_by field non-null
if (currentVocab.getReplacedBy() != null) return;

// Check labels in priority order and add to results if a match is found
if (findAndAddMatch(Collections.singletonList(currentVocab.getDisplayLabel()), contactOrgs) ||
findAndAddMatch(currentVocab.getAltLabels(), contactOrgs) ||
findAndAddMatch(Collections.singletonList(currentVocab.getLabel()), contactOrgs) ||
findAndAddMatch(currentVocab.getHiddenLabels(), contactOrgs)) {
log.info("Match found: {}", currentVocab);
results.add(currentVocab);
return;
}

// Recursively search narrower nodes
List<VocabModel> narrowerNodes = currentVocab.getNarrower();
if (narrowerNodes != null) {
for (VocabModel narrowerNode : narrowerNodes) {
dfsSearch(narrowerNode, contactOrgs, results);
}
}
}

private boolean findAndAddMatch(List<String> labels, List<String> contactOrgs) {
if (labels == null || labels.isEmpty()) return false;
for (String label : labels) {
if (label != null) {
for (String contactOrg : contactOrgs) {
if (label.equalsIgnoreCase(contactOrg)) {
return true;
}
}
}
}
return false;
}

protected List<JsonNode> groupVocabsFromEsByKey(String key) throws IOException {
List<JsonNode> vocabs = new ArrayList<>();
log.info("Fetching {} vocabularies from {}", key, vocabsIndexName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import au.org.aodn.stac.model.ConceptModel;
import au.org.aodn.stac.model.ThemesModel;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import org.springframework.stereotype.Component;
Expand Down Expand Up @@ -52,23 +55,20 @@ private static String readResourceFile(String path) throws IOException {
// Load the CSV file into a HashMap
private void loadCsvToMap(String path) {
try {

log.info("Loading GCMD mapping contents from CSV resource: {}", path);

// Read the file as a single String
String fileContent = readResourceFile(path);

// Split the content into lines
String[] lines = fileContent.split("\\r?\\n");

// Process each line
for (String line : lines) {
// Split the line into key and value based on comma
String[] parts = line.split(",");
if (parts.length >= 2) {
String key = parts[0].trim();
String value = parts[1].trim();
gcmdMapping.put(key, value);
// Read the file content using Apache Commons CSV
Resource resource = new ClassPathResource(path);
try (InputStream inputStream = resource.getInputStream();
Reader reader = new InputStreamReader(inputStream);
CSVParser csvParser = new CSVParser(reader, CSVFormat.DEFAULT)) {

for (CSVRecord record : csvParser) {
if (record.size() >= 2) { // Ensure at least key-value pairs exist
String key = record.get(0).trim();
String value = record.get(1).trim();
gcmdMapping.put(key, value);
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Profile;
import org.springframework.scheduling.annotation.Scheduled;

import java.io.IOException;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import au.org.aodn.esindexer.utils.AssociatedRecordsUtil;
import au.org.aodn.esindexer.model.RelationType;
import au.org.aodn.esindexer.utils.JaxbUtils;
import au.org.aodn.esindexer.utils.StringUtil;
import au.org.aodn.metadata.iso19115_3_2018.MDMetadataType;
import co.elastic.clients.elasticsearch.ElasticsearchClient;
import co.elastic.clients.elasticsearch.core.SearchRequest;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
package au.org.aodn.esindexer.utils;

import au.org.aodn.esindexer.utils.StringUtil;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;

import java.nio.charset.StandardCharsets;
import java.util.List;

public class StringUtilTest {
@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@
"temporal" : [ {
"start" : null,
"end" : null
} ]
} ],
"parameter_vocabs" : [ ],
"organisation_vocabs" : [ ]
},
"contacts" : [ {
"roles" : [ "pointOfContact", "metadata" ],
Expand Down
4 changes: 3 additions & 1 deletion indexer/src/test/resources/canned/associated/self.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@
},
"temporal" : [ {
"start" : "2007-04-03T06:00:00Z"
} ]
} ],
"parameter_vocabs" : [ "ocean biota", "conductivity", "temperature" ],
"organisation_vocabs" : [ "Integrated Marine Observing System (IMOS)" ]
},
"contacts" : [ {
"roles" : [ "pointOfContact", "about" ],
Expand Down
4 changes: 3 additions & 1 deletion indexer/src/test/resources/canned/keywords_null_stac.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@
"temporal" : [ {
"start" : "2018-09-13T14:00:00Z",
"end" : null
} ]
} ],
"parameter_vocabs" : [ ],
"organisation_vocabs" : [ ]
},
"contacts" : [ {
"roles" : [ "resourceProvider", "about" ],
Expand Down
4 changes: 3 additions & 1 deletion indexer/src/test/resources/canned/sample10_stac.json
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,9 @@
"temporal" : [ {
"start" : "2007-09-08T14:00:00Z",
"end" : null
} ]
} ],
"parameter_vocabs" : [ ],
"organisation_vocabs" : [ ]
},
"contacts" : [ {
"roles" : [ "pointOfContact", "about" ],
Expand Down
4 changes: 3 additions & 1 deletion indexer/src/test/resources/canned/sample11_stac.json
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,9 @@
"temporal" : [ {
"start" : "2008-09-28T14:00:00Z",
"end" : null
} ]
} ],
"parameter_vocabs" : [ ],
"organisation_vocabs" : [ ]
},
"contacts" : [ {
"roles" : [ "pointOfContact", "about" ],
Expand Down
4 changes: 3 additions & 1 deletion indexer/src/test/resources/canned/sample12_stac.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@
"temporal" : [ {
"start" : "2012-04-15T14:00:00Z",
"end" : "2012-04-30T13:59:59Z"
} ]
} ],
"parameter_vocabs" : [ ],
"organisation_vocabs" : [ ]
},
"contacts" : [ {
"roles" : [ "custodian", "about" ],
Expand Down
Loading

0 comments on commit 15482dd

Please sign in to comment.