Merge pull request #167 from aodn/manual-org-vocabs-mapping

Manual org vocabs mapping
aodn · Nov 28, 2024 · 15482dd · 15482dd
2 parents 48889c1 + 00d4559
commit 15482dd
Show file tree

Hide file tree

Showing 38 changed files with 404 additions and 304 deletions.
diff --git a/indexer/pom.xml b/indexer/pom.xml
@@ -169,6 +169,11 @@
             <artifactId>junit</artifactId>
             <scope>test</scope>
         </dependency>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-csv</artifactId>
+            <version>1.10.0</version>
+        </dependency>
         <dependency>
             <groupId>org.springframework.boot</groupId>
             <artifactId>spring-boot-starter-test</artifactId>

diff --git a/indexer/src/main/java/au/org/aodn/esindexer/service/GeoNetworkServiceImpl.java b/indexer/src/main/java/au/org/aodn/esindexer/service/GeoNetworkServiceImpl.java
@@ -36,7 +36,6 @@
 import org.springframework.web.client.RestTemplate;
 
 import java.io.IOException;
-import java.nio.charset.StandardCharsets;
 import java.util.*;
 import java.util.concurrent.atomic.AtomicReference;
 

diff --git a/indexer/src/main/java/au/org/aodn/esindexer/service/IndexerServiceImpl.java b/indexer/src/main/java/au/org/aodn/esindexer/service/IndexerServiceImpl.java
@@ -1,5 +1,6 @@
 package au.org.aodn.esindexer.service;
 
+import au.org.aodn.ardcvocabs.model.VocabModel;
 import au.org.aodn.esindexer.configuration.AppConstants;
 import au.org.aodn.esindexer.exception.*;
 import au.org.aodn.esindexer.model.DatasetProvider;
@@ -47,6 +48,8 @@
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
+import static au.org.aodn.esindexer.utils.CommonUtils.safeGet;
+
 @Slf4j
 @Service
 @Scope(proxyMode = ScopedProxyMode.TARGET_CLASS)
@@ -184,12 +187,18 @@ protected StacCollectionModel getMappedMetadataValues(String metadataValues) thr
         stacCollectionModel.getSummaries().setScore(score);
 
         // parameter vocabs
-        List<String> mappedParameterVocabsFromGcmdKeywords = gcmdKeywordUtils.getMappedParameterVocabsFromGcmdKeywords(stacCollectionModel.getThemes());
-        List<String> processedParameterVocabs = vocabService.extractVocabLabelsFromThemes(stacCollectionModel.getThemes(), AppConstants.AODN_DISCOVERY_PARAMETER_VOCABS);
+        Set<String> mappedParameterLabels = new HashSet<>();
+        List<String> processedParameterVocabs = vocabService.extractVocabLabelsFromThemes(
+                stacCollectionModel.getThemes(), AppConstants.AODN_DISCOVERY_PARAMETER_VOCABS
+        );
 
         if (!processedParameterVocabs.isEmpty()) {
-            stacCollectionModel.getSummaries().setParameterVocabs(Stream.concat(mappedParameterVocabsFromGcmdKeywords.stream(), processedParameterVocabs.stream()).distinct().collect(Collectors.toList()));
+            mappedParameterLabels.addAll(processedParameterVocabs);
+        } else {
+            // manual mapping with custom logic when the record doesn't have existing AODN Parameter Vocabs
+            mappedParameterLabels.addAll(gcmdKeywordUtils.getMappedParameterVocabsFromGcmdKeywords(stacCollectionModel.getThemes()));
         }
+        stacCollectionModel.getSummaries().setParameterVocabs(new ArrayList<>(mappedParameterLabels));
 
         /*
         NOTE: The following implementation for platform and organization vocabularies is just a placeholder, not the final version.
@@ -204,7 +213,18 @@ protected StacCollectionModel getMappedMetadataValues(String metadataValues) thr
         }
 
         // organisation vocabs
-        // TODO: the logics for mapping record's organisation vocabs are heavily customised for a manual approach, AI now or later? need dedicated service's method
+        Set<String> mappedOrganisationLabels = new HashSet<>();
+        List<String> organisationLabelsFromThemes = vocabService.extractOrganisationVocabLabelsFromThemes(stacCollectionModel.getThemes());
+        if (!organisationLabelsFromThemes.isEmpty()) {
+            mappedOrganisationLabels.addAll(organisationLabelsFromThemes);
+        } else {
+            // manual mapping with custom logics when the record doesn't have existing AODN Organisation Vocabs
+            List<VocabModel> mappedOrganisationVocabsFromContacts = vocabService.getMappedOrganisationVocabsFromContacts(stacCollectionModel.getContacts());
+            for (VocabModel vocabModel : mappedOrganisationVocabsFromContacts) {
+                mappedOrganisationLabels.addAll(extractOrderedLabels(vocabModel));
+            }
+        }
+        stacCollectionModel.getSummaries().setOrganisationVocabs(new ArrayList<>(mappedOrganisationLabels));
 
         // search_as_you_type enabled fields can be extended
         SearchSuggestionsModel searchSuggestionsModel = SearchSuggestionsModel.builder()
@@ -216,6 +236,19 @@ protected StacCollectionModel getMappedMetadataValues(String metadataValues) thr
 
         return stacCollectionModel;
     }
+
+    private List<String> extractOrderedLabels(VocabModel vocabModel) {
+        // Priority: DisplayLabel > AltLabels > PrefLabel
+        if (safeGet(vocabModel::getDisplayLabel).isPresent()) {
+            return List.of(vocabModel.getDisplayLabel());
+        } else if (safeGet(vocabModel::getAltLabels).isPresent()) {
+            return vocabModel.getAltLabels();
+        } else if (safeGet(vocabModel::getLabel).isPresent()) {
+            return List.of(vocabModel.getLabel());
+        }
+        return List.of();
+    }
+
     /**
      * Use to index a particular UUID, the async is used to limit the number of same function call to avoid flooding
      * the system.

diff --git a/indexer/src/main/java/au/org/aodn/esindexer/service/StacCollectionMapperService.java b/indexer/src/main/java/au/org/aodn/esindexer/service/StacCollectionMapperService.java
@@ -22,7 +22,6 @@
 import java.time.*;
 import java.time.format.DateTimeFormatter;
 import java.util.*;
-import java.util.function.Function;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;

diff --git a/indexer/src/main/java/au/org/aodn/esindexer/service/VocabService.java b/indexer/src/main/java/au/org/aodn/esindexer/service/VocabService.java
@@ -1,21 +1,22 @@
 package au.org.aodn.esindexer.service;
 
+import au.org.aodn.ardcvocabs.model.VocabModel;
+import au.org.aodn.stac.model.ContactsModel;
 import au.org.aodn.stac.model.ThemesModel;
 import com.fasterxml.jackson.databind.JsonNode;
 
 import java.io.IOException;
 import java.util.List;
-import java.util.concurrent.ExecutionException;
 
 public interface VocabService {
     List<String> extractVocabLabelsFromThemes(List<ThemesModel> themes, String vocabType) throws IOException;
-
+    List<String> extractOrganisationVocabLabelsFromThemes(List<ThemesModel> themes) throws IOException;
+    List<VocabModel> getMappedOrganisationVocabsFromContacts(List<ContactsModel> contacts) throws IOException;
     void populateVocabsData() throws IOException;
     void populateVocabsDataAsync();
     void clearParameterVocabCache();
     void clearPlatformVocabCache();
     void clearOrganisationVocabCache();
-
     List<JsonNode> getParameterVocabs() throws IOException;
     List<JsonNode> getPlatformVocabs() throws IOException;
     List<JsonNode> getOrganisationVocabs() throws IOException;

diff --git a/indexer/src/main/java/au/org/aodn/esindexer/service/VocabServiceImpl.java b/indexer/src/main/java/au/org/aodn/esindexer/service/VocabServiceImpl.java
@@ -7,6 +7,7 @@
 import au.org.aodn.esindexer.configuration.AppConstants;
 import au.org.aodn.esindexer.exception.DocumentNotFoundException;
 import au.org.aodn.stac.model.ConceptModel;
+import au.org.aodn.stac.model.ContactsModel;
 import au.org.aodn.stac.model.ThemesModel;
 import co.elastic.clients.elasticsearch.ElasticsearchClient;
 import co.elastic.clients.elasticsearch._types.ElasticsearchException;
@@ -33,6 +34,8 @@
 import java.io.IOException;
 import java.util.*;
 
+import static au.org.aodn.esindexer.utils.CommonUtils.safeGet;
+
 @Slf4j
 @Service
 // create and inject a stub proxy to self due to the circular reference http://bit.ly/4aFvYtt
@@ -139,6 +142,105 @@ public List<String> extractVocabLabelsFromThemes(List<ThemesModel> themes, Strin
         return results;
     }
 
+    public List<String> extractOrganisationVocabLabelsFromThemes(List<ThemesModel> themes) {
+        List<String> results = new ArrayList<>();
+        themes.stream()
+                .filter(Objects::nonNull)
+                .forEach(theme -> safeGet(theme::getTitle)
+                        .filter(title -> title.toLowerCase().contains("aodn organisation vocabulary"))
+                        .ifPresent(title -> theme.getConcepts().stream()
+                                .filter(concept -> concept.getId() != null && !concept.getId().isEmpty())
+                                .forEach(concept -> results.add(concept.getId()))
+                        ));
+        return results;
+    }
+
+    public List<VocabModel> getMappedOrganisationVocabsFromContacts(List<ContactsModel> contacts) throws IOException {
+        List<String> contactOrgs = new ArrayList<>();
+        String citationRole = "citation";
+        String pointOfContactRole = "pointOfContact";
+
+        // Top priority to citation: cit:citedResponsibleParty>
+        contacts.stream()
+            .filter(contact -> safeGet(contact::getRoles)
+                .filter(roles -> roles.contains(citationRole))
+                .isPresent())
+            .map(ContactsModel::getOrganization)
+            .filter(Objects::nonNull)
+            .forEach(contactOrgs::add);
+
+        // Second priority if contactOrgs is still empty
+        if (contactOrgs.isEmpty()) {
+            contacts.stream()
+                .filter(contact -> safeGet(contact::getRoles)
+                    .filter(roles -> roles.contains(pointOfContactRole))
+                    .isPresent())
+                .map(ContactsModel::getOrganization)
+                .filter(Objects::nonNull)
+                .forEach(contactOrgs::add);
+        }
+
+        List<VocabModel> results = new ArrayList<>();
+        for (JsonNode orgVocab : self.getOrganisationVocabs()) {
+            if (orgVocab != null) {
+                try {
+                    VocabModel vocabModel = indexerObjectMapper.treeToValue(orgVocab, VocabModel.class);
+                    dfsSearch(vocabModel, contactOrgs, results);
+                } catch (JsonProcessingException e) {
+                    log.error("Error deserializing JsonNode to VocabModel", e);
+                }
+            }
+        }
+
+        return results;
+    }
+
+    /**
+     * Performs a Depth-First Search (DFS) to find vocab matches.
+     * DFS is well-suited for hierarchical structures due to its memory efficiency
+     * and ability to capture matches at any depth while allowing early exits within branches.
+     *
+     * @param currentVocab the current vocab node being processed
+     * @param contactOrgs  the list of organisation names to match against
+     * @param results      the list to store matching vocab nodes
+     */
+    private void dfsSearch(VocabModel currentVocab, List<String> contactOrgs, List<VocabModel> results) {
+        // Skip vocabs that have replaced_by field non-null
+        if (currentVocab.getReplacedBy() != null) return;
+
+        // Check labels in priority order and add to results if a match is found
+        if (findAndAddMatch(Collections.singletonList(currentVocab.getDisplayLabel()), contactOrgs) ||
+                findAndAddMatch(currentVocab.getAltLabels(), contactOrgs) ||
+                findAndAddMatch(Collections.singletonList(currentVocab.getLabel()), contactOrgs) ||
+                findAndAddMatch(currentVocab.getHiddenLabels(), contactOrgs)) {
+            log.info("Match found: {}", currentVocab);
+            results.add(currentVocab);
+            return;
+        }
+
+        // Recursively search narrower nodes
+        List<VocabModel> narrowerNodes = currentVocab.getNarrower();
+        if (narrowerNodes != null) {
+            for (VocabModel narrowerNode : narrowerNodes) {
+                dfsSearch(narrowerNode, contactOrgs, results);
+            }
+        }
+    }
+
+    private boolean findAndAddMatch(List<String> labels, List<String> contactOrgs) {
+        if (labels == null || labels.isEmpty()) return false;
+        for (String label : labels) {
+            if (label != null) {
+                for (String contactOrg : contactOrgs) {
+                    if (label.equalsIgnoreCase(contactOrg)) {
+                        return true;
+                    }
+                }
+            }
+        }
+        return false;
+    }
+
     protected List<JsonNode> groupVocabsFromEsByKey(String key) throws IOException {
         List<JsonNode> vocabs = new ArrayList<>();
         log.info("Fetching {} vocabularies from {}", key, vocabsIndexName);

diff --git a/indexer/src/main/java/au/org/aodn/esindexer/utils/GcmdKeywordUtils.java b/indexer/src/main/java/au/org/aodn/esindexer/utils/GcmdKeywordUtils.java
@@ -3,6 +3,9 @@
 import au.org.aodn.stac.model.ConceptModel;
 import au.org.aodn.stac.model.ThemesModel;
 import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
 import org.springframework.core.io.ClassPathResource;
 import org.springframework.core.io.Resource;
 import org.springframework.stereotype.Component;
@@ -52,23 +55,20 @@ private static String readResourceFile(String path) throws IOException {
     // Load the CSV file into a HashMap
     private void loadCsvToMap(String path) {
         try {
-
             log.info("Loading GCMD mapping contents from CSV resource: {}", path);
 
-            // Read the file as a single String
-            String fileContent = readResourceFile(path);
-
-            // Split the content into lines
-            String[] lines = fileContent.split("\\r?\\n");
-
-            // Process each line
-            for (String line : lines) {
-                // Split the line into key and value based on comma
-                String[] parts = line.split(",");
-                if (parts.length >= 2) {
-                    String key = parts[0].trim();
-                    String value = parts[1].trim();
-                    gcmdMapping.put(key, value);
+            // Read the file content using Apache Commons CSV
+            Resource resource = new ClassPathResource(path);
+            try (InputStream inputStream = resource.getInputStream();
+                 Reader reader = new InputStreamReader(inputStream);
+                 CSVParser csvParser = new CSVParser(reader, CSVFormat.DEFAULT)) {
+
+                for (CSVRecord record : csvParser) {
+                    if (record.size() >= 2) { // Ensure at least key-value pairs exist
+                        String key = record.get(0).trim();
+                        String value = record.get(1).trim();
+                        gcmdMapping.put(key, value);
+                    }
                 }
             }
 

diff --git a/indexer/src/main/java/au/org/aodn/esindexer/utils/VocabsIndexUtils.java b/indexer/src/main/java/au/org/aodn/esindexer/utils/VocabsIndexUtils.java
@@ -5,7 +5,6 @@
 import lombok.extern.slf4j.Slf4j;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Value;
-import org.springframework.context.annotation.Profile;
 import org.springframework.scheduling.annotation.Scheduled;
 
 import java.io.IOException;

diff --git a/indexer/src/test/java/au/org/aodn/esindexer/service/GeoNetworkServiceIT.java b/indexer/src/test/java/au/org/aodn/esindexer/service/GeoNetworkServiceIT.java
@@ -8,7 +8,6 @@
 import au.org.aodn.esindexer.utils.AssociatedRecordsUtil;
 import au.org.aodn.esindexer.model.RelationType;
 import au.org.aodn.esindexer.utils.JaxbUtils;
-import au.org.aodn.esindexer.utils.StringUtil;
 import au.org.aodn.metadata.iso19115_3_2018.MDMetadataType;
 import co.elastic.clients.elasticsearch.ElasticsearchClient;
 import co.elastic.clients.elasticsearch.core.SearchRequest;

diff --git a/indexer/src/test/java/au/org/aodn/esindexer/utils/StringUtilTest.java b/indexer/src/test/java/au/org/aodn/esindexer/utils/StringUtilTest.java
@@ -1,12 +1,10 @@
 package au.org.aodn.esindexer.utils;
 
-import au.org.aodn.esindexer.utils.StringUtil;
 import org.junit.jupiter.api.Test;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNull;
 
 import java.nio.charset.StandardCharsets;
-import java.util.List;
 
 public class StringUtilTest {
     @Test

diff --git a/indexer/src/test/resources/canned/abstract_resposibilty_null_stac.json b/indexer/src/test/resources/canned/abstract_resposibilty_null_stac.json
@@ -29,7 +29,9 @@
     "temporal" : [ {
       "start" : null,
       "end" : null
-    } ]
+    } ],
+    "parameter_vocabs" : [ ],
+    "organisation_vocabs" : [ ]
   },
   "contacts" : [ {
     "roles" : [ "pointOfContact", "metadata" ],

diff --git a/indexer/src/test/resources/canned/associated/self.json b/indexer/src/test/resources/canned/associated/self.json
@@ -41,7 +41,9 @@
     },
     "temporal" : [ {
       "start" : "2007-04-03T06:00:00Z"
-    } ]
+    } ],
+    "parameter_vocabs" : [ "ocean biota", "conductivity", "temperature" ],
+    "organisation_vocabs" : [ "Integrated Marine Observing System (IMOS)" ]
   },
   "contacts" : [ {
     "roles" : [ "pointOfContact", "about" ],

diff --git a/indexer/src/test/resources/canned/keywords_null_stac.json b/indexer/src/test/resources/canned/keywords_null_stac.json
@@ -32,7 +32,9 @@
     "temporal" : [ {
       "start" : "2018-09-13T14:00:00Z",
       "end" : null
-    } ]
+    } ],
+    "parameter_vocabs" : [ ],
+    "organisation_vocabs" : [ ]
   },
   "contacts" : [ {
     "roles" : [ "resourceProvider", "about" ],

diff --git a/indexer/src/test/resources/canned/sample10_stac.json b/indexer/src/test/resources/canned/sample10_stac.json
@@ -172,7 +172,9 @@
     "temporal" : [ {
       "start" : "2007-09-08T14:00:00Z",
       "end" : null
-    } ]
+    } ],
+    "parameter_vocabs" : [ ],
+    "organisation_vocabs" : [ ]
   },
   "contacts" : [ {
     "roles" : [ "pointOfContact", "about" ],

diff --git a/indexer/src/test/resources/canned/sample11_stac.json b/indexer/src/test/resources/canned/sample11_stac.json
@@ -78,7 +78,9 @@
     "temporal" : [ {
       "start" : "2008-09-28T14:00:00Z",
       "end" : null
-    } ]
+    } ],
+    "parameter_vocabs" : [ ],
+    "organisation_vocabs" : [ ]
   },
   "contacts" : [ {
     "roles" : [ "pointOfContact", "about" ],

diff --git a/indexer/src/test/resources/canned/sample12_stac.json b/indexer/src/test/resources/canned/sample12_stac.json
@@ -33,7 +33,9 @@
     "temporal" : [ {
       "start" : "2012-04-15T14:00:00Z",
       "end" : "2012-04-30T13:59:59Z"
-    } ]
+    } ],
+    "parameter_vocabs" : [ ],
+    "organisation_vocabs" : [ ]
   },
   "contacts" : [ {
     "roles" : [ "custodian", "about" ],