From 5606f21eb5b83afa0e20e0ea0b4ce2f49aaa3681 Mon Sep 17 00:00:00 2001 From: Jorrit Poelen Date: Mon, 29 Jul 2024 11:45:41 -0500 Subject: [PATCH] enable streaming of names; related to https://github.com/Big-Bee-Network/bif/issues/1 --- .../elton/cmd/CmdInteractions.java | 184 +----------------- .../elton/cmd/CmdNames.java | 65 +------ .../elton/cmd/CmdReview.java | 12 +- .../elton/cmd/CmdStream.java | 27 ++- .../elton/cmd/NodeFactorFactory.java | 8 + ...ler.java => StreamingDatasetsHandler.java} | 34 ++-- .../elton/cmd/WriterInteractionsTSV.java | 177 +++++++++++++++++ .../elton/cmd/WriterTaxonTSV.java | 47 +++++ .../elton/cmd/WriterUtil.java | 58 ++++++ .../elton/util/TaxonWriter.java | 2 - 10 files changed, 338 insertions(+), 276 deletions(-) create mode 100644 src/main/java/org/globalbioticinteractions/elton/cmd/NodeFactorFactory.java rename src/main/java/org/globalbioticinteractions/elton/cmd/{StreamingNamespaceConfigHandler.java => StreamingDatasetsHandler.java} (69%) create mode 100644 src/main/java/org/globalbioticinteractions/elton/cmd/WriterInteractionsTSV.java create mode 100644 src/main/java/org/globalbioticinteractions/elton/cmd/WriterTaxonTSV.java create mode 100644 src/main/java/org/globalbioticinteractions/elton/cmd/WriterUtil.java diff --git a/src/main/java/org/globalbioticinteractions/elton/cmd/CmdInteractions.java b/src/main/java/org/globalbioticinteractions/elton/cmd/CmdInteractions.java index 31d9862..685023d 100644 --- a/src/main/java/org/globalbioticinteractions/elton/cmd/CmdInteractions.java +++ b/src/main/java/org/globalbioticinteractions/elton/cmd/CmdInteractions.java @@ -1,80 +1,15 @@ package org.globalbioticinteractions.elton.cmd; -import org.apache.commons.lang.StringUtils; import org.eol.globi.data.ImportLogger; -import org.eol.globi.domain.InteractType; +import org.eol.globi.data.NodeFactory; import org.eol.globi.domain.LogContext; -import org.eol.globi.domain.PropertyAndValueDictionary; -import org.eol.globi.domain.Study; -import org.globalbioticinteractions.dataset.Dataset; import org.globalbioticinteractions.dataset.DatasetRegistry; -import org.globalbioticinteractions.elton.util.DatasetProcessorForTSV; import org.globalbioticinteractions.elton.util.DatasetRegistryUtil; -import org.globalbioticinteractions.elton.util.InteractionWriter; -import org.globalbioticinteractions.elton.util.NodeFactoryForDataset; -import org.globalbioticinteractions.elton.util.NodeFactoryNull; import org.globalbioticinteractions.elton.util.ProgressUtil; -import org.globalbioticinteractions.elton.util.SpecimenImpl; -import org.globalbioticinteractions.elton.util.StreamUtil; -import org.globalbioticinteractions.elton.util.TabularWriter; import picocli.CommandLine; import java.io.PrintStream; import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Stream; - -import static org.eol.globi.data.DatasetImporterForMetaTable.EVENT_DATE; -import static org.eol.globi.data.DatasetImporterForTSV.ARGUMENT_TYPE_ID; -import static org.eol.globi.data.DatasetImporterForTSV.BASIS_OF_RECORD_ID; -import static org.eol.globi.data.DatasetImporterForTSV.BASIS_OF_RECORD_NAME; -import static org.eol.globi.data.DatasetImporterForTSV.DECIMAL_LATITUDE; -import static org.eol.globi.data.DatasetImporterForTSV.DECIMAL_LONGITUDE; -import static org.eol.globi.data.DatasetImporterForTSV.INTERACTION_TYPE_ID; -import static org.eol.globi.data.DatasetImporterForTSV.INTERACTION_TYPE_NAME; -import static org.eol.globi.data.DatasetImporterForTSV.LOCALITY_ID; -import static org.eol.globi.data.DatasetImporterForTSV.LOCALITY_NAME; -import static org.eol.globi.data.DatasetImporterForTSV.REFERENCE_CITATION; -import static org.eol.globi.data.DatasetImporterForTSV.REFERENCE_DOI; -import static org.eol.globi.data.DatasetImporterForTSV.REFERENCE_URL; -import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_BODY_PART_ID; -import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_BODY_PART_NAME; -import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_CATALOG_NUMBER; -import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_COLLECTION_CODE; -import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_COLLECTION_ID; -import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_INSTITUTION_CODE; -import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_LIFE_STAGE_ID; -import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_LIFE_STAGE_NAME; -import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_OCCURRENCE_ID; -import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_SEX_ID; -import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_SEX_NAME; -import static org.eol.globi.data.DatasetImporterForTSV.TARGET_BODY_PART_ID; -import static org.eol.globi.data.DatasetImporterForTSV.TARGET_BODY_PART_NAME; -import static org.eol.globi.data.DatasetImporterForTSV.TARGET_CATALOG_NUMBER; -import static org.eol.globi.data.DatasetImporterForTSV.TARGET_COLLECTION_CODE; -import static org.eol.globi.data.DatasetImporterForTSV.TARGET_COLLECTION_ID; -import static org.eol.globi.data.DatasetImporterForTSV.TARGET_INSTITUTION_CODE; -import static org.eol.globi.data.DatasetImporterForTSV.TARGET_LIFE_STAGE_ID; -import static org.eol.globi.data.DatasetImporterForTSV.TARGET_LIFE_STAGE_NAME; -import static org.eol.globi.data.DatasetImporterForTSV.TARGET_OCCURRENCE_ID; -import static org.eol.globi.data.DatasetImporterForTSV.TARGET_SEX_ID; -import static org.eol.globi.data.DatasetImporterForTSV.TARGET_SEX_NAME; -import static org.eol.globi.domain.PropertyAndValueDictionary.CATALOG_NUMBER; -import static org.eol.globi.domain.PropertyAndValueDictionary.COLLECTION_CODE; -import static org.eol.globi.domain.PropertyAndValueDictionary.COLLECTION_ID; -import static org.eol.globi.domain.PropertyAndValueDictionary.INSTITUTION_CODE; -import static org.eol.globi.domain.PropertyAndValueDictionary.OCCURRENCE_ID; -import static org.eol.globi.service.TaxonUtil.SOURCE_TAXON_ID; -import static org.eol.globi.service.TaxonUtil.SOURCE_TAXON_NAME; -import static org.eol.globi.service.TaxonUtil.SOURCE_TAXON_PATH; -import static org.eol.globi.service.TaxonUtil.SOURCE_TAXON_PATH_IDS; -import static org.eol.globi.service.TaxonUtil.SOURCE_TAXON_PATH_NAMES; -import static org.eol.globi.service.TaxonUtil.SOURCE_TAXON_RANK; -import static org.eol.globi.service.TaxonUtil.TARGET_TAXON_ID; -import static org.eol.globi.service.TaxonUtil.TARGET_TAXON_NAME; -import static org.eol.globi.service.TaxonUtil.TARGET_TAXON_PATH; -import static org.eol.globi.service.TaxonUtil.TARGET_TAXON_PATH_IDS; -import static org.eol.globi.service.TaxonUtil.TARGET_TAXON_PATH_NAMES; -import static org.eol.globi.service.TaxonUtil.TARGET_TAXON_RANK; @CommandLine.Command( name = "interactions", @@ -83,133 +18,20 @@ ) public class CmdInteractions extends CmdTabularWriterParams { - public static class TsvWriter implements InteractionWriter, TabularWriter { - private final PrintStream out; - - TsvWriter(PrintStream out) { - this.out = out; - } - - @Override - public void write(SpecimenImpl source, InteractType type, SpecimenImpl target, Study study, Dataset dataset) { - Stream valueStream = getValues(source, type, target, study, dataset); - String row = StreamUtil.tsvRowOf(valueStream); - out.println(row); - } - - private static Stream getValues(SpecimenImpl source, InteractType type, SpecimenImpl target, Study study, Dataset dataset) { - Stream interactStream = Stream.of(type.getIRI(), type.getLabel()); - - String sourceOccurrenceId = valueOrEmpty(source, OCCURRENCE_ID); - String sourceCatalogNumber = valueOrEmpty(source, CATALOG_NUMBER); - String sourceCollectionCode = valueOrEmpty(source, COLLECTION_CODE); - String sourceCollectionId = valueOrEmpty(source, COLLECTION_ID); - String sourceInstitutionCode = valueOrEmpty(source, INSTITUTION_CODE); - - String targetOccurrenceId = valueOrEmpty(target, OCCURRENCE_ID); - String targetCatalogNumber = valueOrEmpty(target, CATALOG_NUMBER); - String targetCollectionCode = valueOrEmpty(target, COLLECTION_CODE); - String targetCollectionId = valueOrEmpty(target, COLLECTION_ID); - String targetInstitutionCode = valueOrEmpty(target, INSTITUTION_CODE); - - return Stream.of( - Stream.of(source.isSupportingClaim() ? PropertyAndValueDictionary.SUPPORTS : PropertyAndValueDictionary.REFUTES), - Stream.of(sourceOccurrenceId, sourceCatalogNumber, sourceCollectionCode, sourceCollectionId, sourceInstitutionCode), - StreamUtil.streamOf(source.taxon), - StreamUtil.streamOf(source), - interactStream, - Stream.of(targetOccurrenceId, targetCatalogNumber, targetCollectionCode, targetCollectionId, targetInstitutionCode), - StreamUtil.streamOf(target.taxon), - StreamUtil.streamOf(target), - StreamUtil.streamOf(target.getBasisOfRecord()), - StreamUtil.streamOf(target.getEventDate()), - StreamUtil.streamOf(target.getSampleLocation()), - StreamUtil.streamOf(study), - CmdUtil.datasetInfo(dataset).stream()).flatMap(x -> x); - } - - private static String valueOrEmpty(SpecimenImpl source, String key) { - String value = source.getProperty(key); - return StringUtils.isBlank(value) ? "" : value; - } - - @Override - public void writeHeader() { - Stream keys = getKeys(); - out.println(StreamUtil.tsvRowOf(keys)); - } - - private static Stream getKeys() { - return Stream.concat(Stream.of( - ARGUMENT_TYPE_ID, - SOURCE_OCCURRENCE_ID, - SOURCE_CATALOG_NUMBER, - SOURCE_COLLECTION_CODE, - SOURCE_COLLECTION_ID, - SOURCE_INSTITUTION_CODE, - SOURCE_TAXON_ID, - SOURCE_TAXON_NAME, - SOURCE_TAXON_RANK, - SOURCE_TAXON_PATH_IDS, - SOURCE_TAXON_PATH, - SOURCE_TAXON_PATH_NAMES, - SOURCE_BODY_PART_ID, - SOURCE_BODY_PART_NAME, - SOURCE_LIFE_STAGE_ID, - SOURCE_LIFE_STAGE_NAME, - SOURCE_SEX_ID, - SOURCE_SEX_NAME, - INTERACTION_TYPE_ID, - INTERACTION_TYPE_NAME, - TARGET_OCCURRENCE_ID, - TARGET_CATALOG_NUMBER, - TARGET_COLLECTION_CODE, - TARGET_COLLECTION_ID, - TARGET_INSTITUTION_CODE, - TARGET_TAXON_ID, - TARGET_TAXON_NAME, - TARGET_TAXON_RANK, - TARGET_TAXON_PATH_IDS, - TARGET_TAXON_PATH, - TARGET_TAXON_PATH_NAMES, - TARGET_BODY_PART_ID, - TARGET_BODY_PART_NAME, - TARGET_LIFE_STAGE_ID, - TARGET_LIFE_STAGE_NAME, - TARGET_SEX_ID, - TARGET_SEX_NAME, - BASIS_OF_RECORD_ID, - BASIS_OF_RECORD_NAME, - EVENT_DATE, - DECIMAL_LATITUDE, - DECIMAL_LONGITUDE, - LOCALITY_ID, - LOCALITY_NAME, - REFERENCE_DOI, - REFERENCE_URL, - REFERENCE_CITATION - - ), StreamUtil.datasetHeaderFields()); - } - } - @Override public void run() { run(System.out); } void run(PrintStream out) { - TsvWriter writer = new TsvWriter(out); - if (!shouldSkipHeader()) { - writer.writeHeader(); - } DatasetRegistry registry = DatasetRegistryUtil.forCacheDirOrLocalDir( getCacheDir(), getWorkDir(), createInputStreamFactory()); - NodeFactoryNull nodeFactory = new NodeFactoryForDataset(writer, new DatasetProcessorForTSV()); + NodeFactory nodeFactory = WriterUtil.nodeFactoryForWritingInteractions(!shouldSkipHeader(), out); + CmdUtil.handleNamespaces( registry, nodeFactory, diff --git a/src/main/java/org/globalbioticinteractions/elton/cmd/CmdNames.java b/src/main/java/org/globalbioticinteractions/elton/cmd/CmdNames.java index c8e2172..0333a47 100644 --- a/src/main/java/org/globalbioticinteractions/elton/cmd/CmdNames.java +++ b/src/main/java/org/globalbioticinteractions/elton/cmd/CmdNames.java @@ -1,22 +1,12 @@ package org.globalbioticinteractions.elton.cmd; import org.eol.globi.data.NodeFactory; -import org.eol.globi.data.NodeFactoryException; -import org.eol.globi.domain.Interaction; -import org.eol.globi.domain.Specimen; -import org.eol.globi.domain.Study; -import org.eol.globi.domain.Taxon; import org.eol.globi.tool.NullImportLogger; -import org.globalbioticinteractions.dataset.Dataset; import org.globalbioticinteractions.dataset.DatasetRegistry; import org.globalbioticinteractions.elton.util.DatasetRegistryUtil; -import org.globalbioticinteractions.elton.util.NodeFactoryNull; -import org.globalbioticinteractions.elton.util.StreamUtil; -import org.globalbioticinteractions.elton.util.TaxonWriter; import picocli.CommandLine; import java.io.PrintStream; -import java.util.stream.Stream; @CommandLine.Command( name = "names", @@ -31,63 +21,12 @@ public void run() { } void run(PrintStream out) { - TaxonWriter writer = createWriter(out); - if (!shouldSkipHeader()) { - writer.writeHeader(); - } DatasetRegistry registry = DatasetRegistryUtil.forCacheDirOrLocalDir(getCacheDir(), getWorkDir(), createInputStreamFactory()); - NodeFactory nodeFactory = createFactory(writer); - CmdUtil.handleNamespaces(registry, nodeFactory, getNamespaces(), "listing taxa", getStderr(), new NullImportLogger()); - } - - private TaxonWriter createWriter(PrintStream out) { - return new TaxonWriter() { - - @Override - public void write(Taxon taxon, Dataset dataset) { - Stream rowStream = Stream.concat(StreamUtil.streamOf(taxon), StreamUtil.streamOf(dataset)); - String row = StreamUtil.tsvRowOf(rowStream); - out.println(row); - } - - @Override - public void writeHeader() { - out.println(StreamUtil.tsvRowOf( - Stream.concat(Stream.of( - "taxonId", - "taxonName", - "taxonRank", - "taxonPathIds", - "taxonPath", - "taxonPathNames"), - StreamUtil.datasetHeaderFields()))); - } - }; - } - - private NodeFactoryNull createFactory(TaxonWriter writer) { - return new NodeFactoryNull() { - Dataset dataset; - - @Override - public Dataset getOrCreateDataset(Dataset dataset) { - this.dataset = dataset; - return super.getOrCreateDataset(dataset); - } - @Override - public Specimen createSpecimen(Interaction interaction, Taxon taxon) throws NodeFactoryException { - writer.write(taxon, dataset); - return super.createSpecimen(interaction, taxon); - } + NodeFactory nodeFactory = WriterUtil.nodeFactoryForTaxonWriting(!shouldSkipHeader(), out); - - @Override - public Specimen createSpecimen(Study study, Taxon taxon) throws NodeFactoryException { - return super.createSpecimen(study, taxon); - } - }; + CmdUtil.handleNamespaces(registry, nodeFactory, getNamespaces(), "listing taxa", getStderr(), new NullImportLogger()); } } diff --git a/src/main/java/org/globalbioticinteractions/elton/cmd/CmdReview.java b/src/main/java/org/globalbioticinteractions/elton/cmd/CmdReview.java index e5e37ae..7229d63 100644 --- a/src/main/java/org/globalbioticinteractions/elton/cmd/CmdReview.java +++ b/src/main/java/org/globalbioticinteractions/elton/cmd/CmdReview.java @@ -25,7 +25,6 @@ import org.eol.globi.util.InputStreamFactory; import org.eol.globi.util.ResourceServiceLocal; import org.eol.globi.util.ResourceServiceLocalAndRemote; -import org.eol.globi.util.ResourceServiceRemote; import org.globalbioticinteractions.dataset.CitationUtil; import org.globalbioticinteractions.dataset.Dataset; import org.globalbioticinteractions.dataset.DatasetConstant; @@ -133,14 +132,9 @@ private void reviewCachedOrRemote(List namespaces, InputStreamFactory in private void review(String repoName, DatasetRegistry registry, InputStreamFactory inputStreamFactory) throws StudyImporterException { final AtomicLong noteCounter = new AtomicLong(0); final AtomicLong infoCounter = new AtomicLong(0); - - ParserFactoryLocal parserFactory = new ParserFactoryLocal(getClass()); AtomicInteger interactionCounter = new AtomicInteger(0); - ReviewReportLogger reviewReportLogger = createReviewReportLogger(repoName, noteCounter, infoCounter); - NodeFactoryLogging nodeFactory = new NodeFactoryLogging(interactionCounter, reviewReportLogger); - DatasetImporterForRegistry studyImporter = new DatasetImporterForRegistry(parserFactory, nodeFactory, registry); - studyImporter.setLogger(reviewReportLogger); + ReviewReportLogger reviewReportLogger = createReviewReportLogger(repoName, noteCounter, infoCounter); try { Dataset dataset = new DatasetFactory( @@ -153,12 +147,16 @@ private void review(String repoName, DatasetRegistry registry, InputStreamFactor && StringUtils.endsWith(citationString, ">")) { reviewReportLogger.warn(null, "no citation found for dataset at [" + dataset.getArchiveURI() + "]"); } + NodeFactoryLogging nodeFactory = new NodeFactoryLogging(interactionCounter, reviewReportLogger); nodeFactory.getOrCreateDataset(dataset); getStderr().print("creating review [" + repoName + "]... "); if (!shouldSkipHeader()) { logHeader(getStdout()); } + ParserFactoryLocal parserFactory = new ParserFactoryLocal(getClass()); + DatasetImporterForRegistry studyImporter = new DatasetImporterForRegistry(parserFactory, nodeFactory, registry); + studyImporter.setLogger(reviewReportLogger); DatasetImportUtil.importDataset( null, dataset, diff --git a/src/main/java/org/globalbioticinteractions/elton/cmd/CmdStream.java b/src/main/java/org/globalbioticinteractions/elton/cmd/CmdStream.java index ef72e04..a76c695 100644 --- a/src/main/java/org/globalbioticinteractions/elton/cmd/CmdStream.java +++ b/src/main/java/org/globalbioticinteractions/elton/cmd/CmdStream.java @@ -6,6 +6,7 @@ import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; +import org.eol.globi.data.NodeFactory; import org.globalbioticinteractions.elton.util.DatasetRegistryUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -29,6 +30,10 @@ public class CmdStream extends CmdDefaultParams { private final static Logger LOG = LoggerFactory.getLogger(CmdStream.class); + @CommandLine.Option(names = {"--record-type"}, + description = "record types (e.g., interaction, name, review)" + ) + private String recordType = "interaction"; @Override public void run() { @@ -43,14 +48,14 @@ public void run() { String namespace = jsonNode.at("/namespace").asText(DatasetRegistryUtil.NAMESPACE_LOCAL); if (StringUtils.isNotBlank(namespace)) { try { - StreamingNamespaceConfigHandler namespaceHandler = new StreamingNamespaceConfigHandler( + boolean shouldWriteHeader = isFirst.get(); + StreamingDatasetsHandler namespaceHandler = new StreamingDatasetsHandler( jsonNode, this.createInputStreamFactory(), this.getCacheDir(), this.getStderr(), - this.getStdout() + new NodeFactorFactoryImpl(shouldWriteHeader) ); - namespaceHandler.setShouldWriteHeader(isFirst.get()); namespaceHandler.onNamespace(namespace); isFirst.set(false); } catch (Exception e) { @@ -69,4 +74,20 @@ public void run() { } + public class NodeFactorFactoryImpl implements NodeFactorFactory { + + private final boolean shouldWriteHeader; + + public NodeFactorFactoryImpl(boolean shouldWriteHeader) { + this.shouldWriteHeader = shouldWriteHeader; + } + + @Override + public NodeFactory createNodeFactory() { + + return StringUtils.equals("name", recordType) + ? WriterUtil.nodeFactoryForWritingInteractions(shouldWriteHeader, getStdout()) + : WriterUtil.nodeFactoryForTaxonWriting(shouldWriteHeader, getStdout()); + } + } } diff --git a/src/main/java/org/globalbioticinteractions/elton/cmd/NodeFactorFactory.java b/src/main/java/org/globalbioticinteractions/elton/cmd/NodeFactorFactory.java new file mode 100644 index 0000000..da26639 --- /dev/null +++ b/src/main/java/org/globalbioticinteractions/elton/cmd/NodeFactorFactory.java @@ -0,0 +1,8 @@ +package org.globalbioticinteractions.elton.cmd; + +import org.eol.globi.data.NodeFactory; + +public interface NodeFactorFactory { + + NodeFactory createNodeFactory(); +} diff --git a/src/main/java/org/globalbioticinteractions/elton/cmd/StreamingNamespaceConfigHandler.java b/src/main/java/org/globalbioticinteractions/elton/cmd/StreamingDatasetsHandler.java similarity index 69% rename from src/main/java/org/globalbioticinteractions/elton/cmd/StreamingNamespaceConfigHandler.java rename to src/main/java/org/globalbioticinteractions/elton/cmd/StreamingDatasetsHandler.java index 229342f..a188145 100644 --- a/src/main/java/org/globalbioticinteractions/elton/cmd/StreamingNamespaceConfigHandler.java +++ b/src/main/java/org/globalbioticinteractions/elton/cmd/StreamingDatasetsHandler.java @@ -11,9 +11,7 @@ import org.globalbioticinteractions.dataset.Dataset; import org.globalbioticinteractions.dataset.DatasetWithCache; import org.globalbioticinteractions.dataset.DatasetWithResourceMapping; -import org.globalbioticinteractions.elton.util.DatasetProcessorForTSV; import org.globalbioticinteractions.elton.util.NamespaceHandler; -import org.globalbioticinteractions.elton.util.NodeFactoryForDataset; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -21,26 +19,26 @@ import java.net.URI; import java.nio.charset.StandardCharsets; -class StreamingNamespaceConfigHandler implements NamespaceHandler { - private final static Logger LOG = LoggerFactory.getLogger(StreamingNamespaceConfigHandler.class); +class StreamingDatasetsHandler implements NamespaceHandler { + private final static Logger LOG = LoggerFactory.getLogger(StreamingDatasetsHandler.class); private final String cacheDir; private final PrintStream stderr; - private final PrintStream stdout; private InputStreamFactory factory; private final JsonNode config; - private boolean shouldWriteHeader; + private NodeFactorFactory nodeFactorFactory; - public StreamingNamespaceConfigHandler(JsonNode jsonNode, - InputStreamFactoryLogging inputStreamFactory, - String cacheDir, - PrintStream stderr, - PrintStream stdout) { + public StreamingDatasetsHandler(JsonNode jsonNode, + InputStreamFactory inputStreamFactory, + String cacheDir, + PrintStream stderr, + NodeFactorFactory nodeFactorFactory) { this.factory = inputStreamFactory; this.cacheDir = cacheDir; this.stderr = stderr; - this.stdout = stdout; this.config = jsonNode; + this.nodeFactorFactory = nodeFactorFactory; + } @Override @@ -61,19 +59,16 @@ public void onNamespace(String namespace) throws Exception { Cache cache = cacheFactory.cacheFor(dataset); DatasetWithCache datasetWithCache = new DatasetWithCache(dataset, cache); - CmdInteractions.TsvWriter writer = new CmdInteractions.TsvWriter(stdout); - if (shouldWriteHeader) { - writer.writeHeader(); - } + NodeFactorFactory nodeFactorFactory = this.nodeFactorFactory; - NodeFactory factory = new NodeFactoryForDataset(writer, new DatasetProcessorForTSV()); - factory.getOrCreateDataset(dataset); + NodeFactory nodeFactory = nodeFactorFactory.createNodeFactory(); + nodeFactory.getOrCreateDataset(dataset); try { DatasetImportUtil.importDataset( null, datasetWithCache, - factory, + nodeFactory, null); stderr.println("done."); } catch (StudyImporterException ex) { @@ -86,6 +81,5 @@ public void onNamespace(String namespace) throws Exception { } public void setShouldWriteHeader(boolean shouldWriteHeader) { - this.shouldWriteHeader = shouldWriteHeader; } } diff --git a/src/main/java/org/globalbioticinteractions/elton/cmd/WriterInteractionsTSV.java b/src/main/java/org/globalbioticinteractions/elton/cmd/WriterInteractionsTSV.java new file mode 100644 index 0000000..c2cb70f --- /dev/null +++ b/src/main/java/org/globalbioticinteractions/elton/cmd/WriterInteractionsTSV.java @@ -0,0 +1,177 @@ +package org.globalbioticinteractions.elton.cmd; + +import org.apache.commons.lang.StringUtils; +import org.eol.globi.domain.InteractType; +import org.eol.globi.domain.PropertyAndValueDictionary; +import org.eol.globi.domain.Study; +import org.globalbioticinteractions.dataset.Dataset; +import org.globalbioticinteractions.elton.util.InteractionWriter; +import org.globalbioticinteractions.elton.util.SpecimenImpl; +import org.globalbioticinteractions.elton.util.StreamUtil; +import org.globalbioticinteractions.elton.util.TabularWriter; + +import java.io.PrintStream; +import java.util.stream.Stream; + +import static org.eol.globi.data.DatasetImporterForMetaTable.EVENT_DATE; +import static org.eol.globi.data.DatasetImporterForTSV.ARGUMENT_TYPE_ID; +import static org.eol.globi.data.DatasetImporterForTSV.BASIS_OF_RECORD_ID; +import static org.eol.globi.data.DatasetImporterForTSV.BASIS_OF_RECORD_NAME; +import static org.eol.globi.data.DatasetImporterForTSV.DECIMAL_LATITUDE; +import static org.eol.globi.data.DatasetImporterForTSV.DECIMAL_LONGITUDE; +import static org.eol.globi.data.DatasetImporterForTSV.INTERACTION_TYPE_ID; +import static org.eol.globi.data.DatasetImporterForTSV.INTERACTION_TYPE_NAME; +import static org.eol.globi.data.DatasetImporterForTSV.LOCALITY_ID; +import static org.eol.globi.data.DatasetImporterForTSV.LOCALITY_NAME; +import static org.eol.globi.data.DatasetImporterForTSV.REFERENCE_CITATION; +import static org.eol.globi.data.DatasetImporterForTSV.REFERENCE_DOI; +import static org.eol.globi.data.DatasetImporterForTSV.REFERENCE_URL; +import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_BODY_PART_ID; +import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_BODY_PART_NAME; +import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_CATALOG_NUMBER; +import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_COLLECTION_CODE; +import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_COLLECTION_ID; +import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_INSTITUTION_CODE; +import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_LIFE_STAGE_ID; +import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_LIFE_STAGE_NAME; +import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_OCCURRENCE_ID; +import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_SEX_ID; +import static org.eol.globi.data.DatasetImporterForTSV.SOURCE_SEX_NAME; +import static org.eol.globi.data.DatasetImporterForTSV.TARGET_BODY_PART_ID; +import static org.eol.globi.data.DatasetImporterForTSV.TARGET_BODY_PART_NAME; +import static org.eol.globi.data.DatasetImporterForTSV.TARGET_CATALOG_NUMBER; +import static org.eol.globi.data.DatasetImporterForTSV.TARGET_COLLECTION_CODE; +import static org.eol.globi.data.DatasetImporterForTSV.TARGET_COLLECTION_ID; +import static org.eol.globi.data.DatasetImporterForTSV.TARGET_INSTITUTION_CODE; +import static org.eol.globi.data.DatasetImporterForTSV.TARGET_LIFE_STAGE_ID; +import static org.eol.globi.data.DatasetImporterForTSV.TARGET_LIFE_STAGE_NAME; +import static org.eol.globi.data.DatasetImporterForTSV.TARGET_OCCURRENCE_ID; +import static org.eol.globi.data.DatasetImporterForTSV.TARGET_SEX_ID; +import static org.eol.globi.data.DatasetImporterForTSV.TARGET_SEX_NAME; +import static org.eol.globi.domain.PropertyAndValueDictionary.CATALOG_NUMBER; +import static org.eol.globi.domain.PropertyAndValueDictionary.COLLECTION_CODE; +import static org.eol.globi.domain.PropertyAndValueDictionary.COLLECTION_ID; +import static org.eol.globi.domain.PropertyAndValueDictionary.INSTITUTION_CODE; +import static org.eol.globi.domain.PropertyAndValueDictionary.OCCURRENCE_ID; +import static org.eol.globi.service.TaxonUtil.SOURCE_TAXON_ID; +import static org.eol.globi.service.TaxonUtil.SOURCE_TAXON_NAME; +import static org.eol.globi.service.TaxonUtil.SOURCE_TAXON_PATH; +import static org.eol.globi.service.TaxonUtil.SOURCE_TAXON_PATH_IDS; +import static org.eol.globi.service.TaxonUtil.SOURCE_TAXON_PATH_NAMES; +import static org.eol.globi.service.TaxonUtil.SOURCE_TAXON_RANK; +import static org.eol.globi.service.TaxonUtil.TARGET_TAXON_ID; +import static org.eol.globi.service.TaxonUtil.TARGET_TAXON_NAME; +import static org.eol.globi.service.TaxonUtil.TARGET_TAXON_PATH; +import static org.eol.globi.service.TaxonUtil.TARGET_TAXON_PATH_IDS; +import static org.eol.globi.service.TaxonUtil.TARGET_TAXON_PATH_NAMES; +import static org.eol.globi.service.TaxonUtil.TARGET_TAXON_RANK; + +public class WriterInteractionsTSV implements InteractionWriter, TabularWriter { + private final PrintStream out; + + WriterInteractionsTSV(PrintStream out) { + this.out = out; + } + + @Override + public void write(SpecimenImpl source, InteractType type, SpecimenImpl target, Study study, Dataset dataset) { + Stream valueStream = getValues(source, type, target, study, dataset); + String row = StreamUtil.tsvRowOf(valueStream); + out.println(row); + } + + private static Stream getValues(SpecimenImpl source, InteractType type, SpecimenImpl target, Study study, Dataset dataset) { + Stream interactStream = Stream.of(type.getIRI(), type.getLabel()); + + String sourceOccurrenceId = valueOrEmpty(source, OCCURRENCE_ID); + String sourceCatalogNumber = valueOrEmpty(source, CATALOG_NUMBER); + String sourceCollectionCode = valueOrEmpty(source, COLLECTION_CODE); + String sourceCollectionId = valueOrEmpty(source, COLLECTION_ID); + String sourceInstitutionCode = valueOrEmpty(source, INSTITUTION_CODE); + + String targetOccurrenceId = valueOrEmpty(target, OCCURRENCE_ID); + String targetCatalogNumber = valueOrEmpty(target, CATALOG_NUMBER); + String targetCollectionCode = valueOrEmpty(target, COLLECTION_CODE); + String targetCollectionId = valueOrEmpty(target, COLLECTION_ID); + String targetInstitutionCode = valueOrEmpty(target, INSTITUTION_CODE); + + return Stream.of( + Stream.of(source.isSupportingClaim() ? PropertyAndValueDictionary.SUPPORTS : PropertyAndValueDictionary.REFUTES), + Stream.of(sourceOccurrenceId, sourceCatalogNumber, sourceCollectionCode, sourceCollectionId, sourceInstitutionCode), + StreamUtil.streamOf(source.taxon), + StreamUtil.streamOf(source), + interactStream, + Stream.of(targetOccurrenceId, targetCatalogNumber, targetCollectionCode, targetCollectionId, targetInstitutionCode), + StreamUtil.streamOf(target.taxon), + StreamUtil.streamOf(target), + StreamUtil.streamOf(target.getBasisOfRecord()), + StreamUtil.streamOf(target.getEventDate()), + StreamUtil.streamOf(target.getSampleLocation()), + StreamUtil.streamOf(study), + CmdUtil.datasetInfo(dataset).stream()).flatMap(x -> x); + } + + private static String valueOrEmpty(SpecimenImpl source, String key) { + String value = source.getProperty(key); + return StringUtils.isBlank(value) ? "" : value; + } + + @Override + public void writeHeader() { + Stream keys = getKeys(); + out.println(StreamUtil.tsvRowOf(keys)); + } + + private static Stream getKeys() { + return Stream.concat(Stream.of( + ARGUMENT_TYPE_ID, + SOURCE_OCCURRENCE_ID, + SOURCE_CATALOG_NUMBER, + SOURCE_COLLECTION_CODE, + SOURCE_COLLECTION_ID, + SOURCE_INSTITUTION_CODE, + SOURCE_TAXON_ID, + SOURCE_TAXON_NAME, + SOURCE_TAXON_RANK, + SOURCE_TAXON_PATH_IDS, + SOURCE_TAXON_PATH, + SOURCE_TAXON_PATH_NAMES, + SOURCE_BODY_PART_ID, + SOURCE_BODY_PART_NAME, + SOURCE_LIFE_STAGE_ID, + SOURCE_LIFE_STAGE_NAME, + SOURCE_SEX_ID, + SOURCE_SEX_NAME, + INTERACTION_TYPE_ID, + INTERACTION_TYPE_NAME, + TARGET_OCCURRENCE_ID, + TARGET_CATALOG_NUMBER, + TARGET_COLLECTION_CODE, + TARGET_COLLECTION_ID, + TARGET_INSTITUTION_CODE, + TARGET_TAXON_ID, + TARGET_TAXON_NAME, + TARGET_TAXON_RANK, + TARGET_TAXON_PATH_IDS, + TARGET_TAXON_PATH, + TARGET_TAXON_PATH_NAMES, + TARGET_BODY_PART_ID, + TARGET_BODY_PART_NAME, + TARGET_LIFE_STAGE_ID, + TARGET_LIFE_STAGE_NAME, + TARGET_SEX_ID, + TARGET_SEX_NAME, + BASIS_OF_RECORD_ID, + BASIS_OF_RECORD_NAME, + EVENT_DATE, + DECIMAL_LATITUDE, + DECIMAL_LONGITUDE, + LOCALITY_ID, + LOCALITY_NAME, + REFERENCE_DOI, + REFERENCE_URL, + REFERENCE_CITATION + + ), StreamUtil.datasetHeaderFields()); + } +} diff --git a/src/main/java/org/globalbioticinteractions/elton/cmd/WriterTaxonTSV.java b/src/main/java/org/globalbioticinteractions/elton/cmd/WriterTaxonTSV.java new file mode 100644 index 0000000..e947a7a --- /dev/null +++ b/src/main/java/org/globalbioticinteractions/elton/cmd/WriterTaxonTSV.java @@ -0,0 +1,47 @@ +package org.globalbioticinteractions.elton.cmd; + +import org.eol.globi.domain.InteractType; +import org.eol.globi.domain.Study; +import org.eol.globi.domain.Taxon; +import org.globalbioticinteractions.dataset.Dataset; +import org.globalbioticinteractions.elton.util.InteractionWriter; +import org.globalbioticinteractions.elton.util.SpecimenImpl; +import org.globalbioticinteractions.elton.util.StreamUtil; +import org.globalbioticinteractions.elton.util.TaxonWriter; + +import java.io.PrintStream; +import java.util.stream.Stream; + +public class WriterTaxonTSV implements TaxonWriter, InteractionWriter { + + private final PrintStream out; + + public WriterTaxonTSV(PrintStream out) { + this.out = out; + } + + @Override + public void write(Taxon taxon, Dataset dataset) { + Stream rowStream = Stream.concat(StreamUtil.streamOf(taxon), StreamUtil.streamOf(dataset)); + String row = StreamUtil.tsvRowOf(rowStream); + out.println(row); + } + + @Override + public void writeHeader() { + out.println(StreamUtil.tsvRowOf( + Stream.concat(Stream.of( + "taxonId", + "taxonName", + "taxonRank", + "taxonPathIds", + "taxonPath", + "taxonPathNames"), + StreamUtil.datasetHeaderFields()))); + } + + @Override + public void write(SpecimenImpl source, InteractType type, SpecimenImpl target, Study study, Dataset dataset) { + + } +} diff --git a/src/main/java/org/globalbioticinteractions/elton/cmd/WriterUtil.java b/src/main/java/org/globalbioticinteractions/elton/cmd/WriterUtil.java new file mode 100644 index 0000000..3f25306 --- /dev/null +++ b/src/main/java/org/globalbioticinteractions/elton/cmd/WriterUtil.java @@ -0,0 +1,58 @@ +package org.globalbioticinteractions.elton.cmd; + +import org.eol.globi.data.NodeFactory; +import org.eol.globi.data.NodeFactoryException; +import org.eol.globi.domain.Interaction; +import org.eol.globi.domain.Specimen; +import org.eol.globi.domain.Study; +import org.eol.globi.domain.Taxon; +import org.globalbioticinteractions.dataset.Dataset; +import org.globalbioticinteractions.elton.util.DatasetProcessorForTSV; +import org.globalbioticinteractions.elton.util.NodeFactoryForDataset; +import org.globalbioticinteractions.elton.util.NodeFactoryNull; +import org.globalbioticinteractions.elton.util.TaxonWriter; + +import java.io.PrintStream; + +public class WriterUtil { + static NodeFactory nodeFactoryForWritingInteractions(boolean shouldWriteHeader, PrintStream stdout) { + WriterInteractionsTSV writer = new WriterInteractionsTSV(stdout); + if (shouldWriteHeader) { + writer.writeHeader(); + } + + return new NodeFactoryForDataset(writer, new DatasetProcessorForTSV()); + } + + static NodeFactory nodeFactoryForTaxonWriting(boolean shouldWriteHeader, PrintStream out) { + TaxonWriter writer = new WriterTaxonTSV(out); + if (shouldWriteHeader) { + writer.writeHeader(); + } + return createFactory(writer); + } + + private static NodeFactoryNull createFactory(TaxonWriter writer) { + return new NodeFactoryNull() { + Dataset dataset; + + @Override + public Dataset getOrCreateDataset(Dataset dataset) { + this.dataset = dataset; + return super.getOrCreateDataset(dataset); + } + + @Override + public Specimen createSpecimen(Interaction interaction, Taxon taxon) throws NodeFactoryException { + writer.write(taxon, dataset); + return super.createSpecimen(interaction, taxon); + } + + + @Override + public Specimen createSpecimen(Study study, Taxon taxon) throws NodeFactoryException { + return super.createSpecimen(study, taxon); + } + }; + } +} diff --git a/src/main/java/org/globalbioticinteractions/elton/util/TaxonWriter.java b/src/main/java/org/globalbioticinteractions/elton/util/TaxonWriter.java index 7deffed..c5bdf65 100644 --- a/src/main/java/org/globalbioticinteractions/elton/util/TaxonWriter.java +++ b/src/main/java/org/globalbioticinteractions/elton/util/TaxonWriter.java @@ -1,7 +1,5 @@ package org.globalbioticinteractions.elton.util; -import org.eol.globi.domain.InteractType; -import org.eol.globi.domain.Study; import org.eol.globi.domain.Taxon; import org.globalbioticinteractions.dataset.Dataset;