From 3fb4cbbfc9c720534e50fbce8e1418fbc5c7037b Mon Sep 17 00:00:00 2001 From: inanyan Date: Wed, 10 Jul 2024 10:53:37 +0300 Subject: [PATCH] Improve the PR --- CHANGELOG.md | 1 + src/main/java/org/jabref/gui/JabRefGUI.java | 3 +- .../ai/embeddings/AiIngestedFilesTracker.java | 2 +- .../logic/ai/embeddings/AiIngestor.java | 31 ++++++++++++++----- .../embeddings/EmbeddingsGenerationTask.java | 23 +++++++++----- .../ai/embeddings/MVStoreEmbeddingStore.java | 5 +++ 6 files changed, 47 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b6c5291bcb..c5ea4e2b866 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv ### Added +- We added an AI chat for linked files. - We added support for offline extracting references from PDFs following the IEEE format. [#11156](https://github.com/JabRef/jabref/pull/11156) - We added a new keyboard shortcut ctrl + , to open the preferences. [#11154](https://github.com/JabRef/jabref/pull/11154) - We added value selection (such as for month) for content selectors in custom entry types. 
[#11109](https://github.com/JabRef/jabref/issues/11109) diff --git a/src/main/java/org/jabref/gui/JabRefGUI.java b/src/main/java/org/jabref/gui/JabRefGUI.java index 4eaa30ac824..52dc6858f85 100644 --- a/src/main/java/org/jabref/gui/JabRefGUI.java +++ b/src/main/java/org/jabref/gui/JabRefGUI.java @@ -1,5 +1,6 @@ package org.jabref.gui; +import java.util.Arrays; import java.util.List; import java.util.Optional; @@ -315,10 +316,10 @@ public void startBackgroundTasks() { @Override public void stop() { + aiService.close(); OOBibBaseConnect.closeOfficeConnection(); stopBackgroundTasks(); shutdownThreadPools(); - aiService.close(); } public void stopBackgroundTasks() { diff --git a/src/main/java/org/jabref/logic/ai/embeddings/AiIngestedFilesTracker.java b/src/main/java/org/jabref/logic/ai/embeddings/AiIngestedFilesTracker.java index 00156b5010b..84fab8444f7 100644 --- a/src/main/java/org/jabref/logic/ai/embeddings/AiIngestedFilesTracker.java +++ b/src/main/java/org/jabref/logic/ai/embeddings/AiIngestedFilesTracker.java @@ -53,7 +53,7 @@ public void endIngestingFile(String link, long modificationTimeInSeconds) { } public Optional getIngestedFileModificationTime(String link) { - return Optional.of(ingestedMap.get(link)); + return Optional.ofNullable(ingestedMap.get(link)); } public void registerListener(Object listener) { diff --git a/src/main/java/org/jabref/logic/ai/embeddings/AiIngestor.java b/src/main/java/org/jabref/logic/ai/embeddings/AiIngestor.java index d651f6b4be9..05c14bdc0e4 100644 --- a/src/main/java/org/jabref/logic/ai/embeddings/AiIngestor.java +++ b/src/main/java/org/jabref/logic/ai/embeddings/AiIngestor.java @@ -8,6 +8,8 @@ import java.util.Optional; import java.util.concurrent.TimeUnit; +import javafx.beans.property.BooleanProperty; + import org.jabref.logic.ai.AiService; import org.jabref.logic.util.io.FileUtil; import org.jabref.logic.xmp.XmpUtilReader; @@ -19,6 +21,7 @@ import dev.langchain4j.data.document.DocumentSplitter; import 
dev.langchain4j.data.document.Metadata; import dev.langchain4j.data.document.splitter.DocumentSplitters; +import dev.langchain4j.data.segment.TextSegment; import dev.langchain4j.store.embedding.EmbeddingStoreIngestor; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; @@ -34,20 +37,26 @@ public class AiIngestor { private final AiService aiService; private EmbeddingStoreIngestor ingestor; + private DocumentSplitter documentSplitter; + + // A workaround to stop ingesting files. + private BooleanProperty shutdownProperty; - public AiIngestor(AiService aiService) { + public AiIngestor(AiService aiService, BooleanProperty shutdownProperty) { this.aiService = aiService; - this.ingestor = rebuild(aiService); + this.shutdownProperty = shutdownProperty; + + rebuild(aiService); setupListeningToPreferencesChanges(); } - private EmbeddingStoreIngestor rebuild(AiService aiService) { - DocumentSplitter documentSplitter = DocumentSplitters + private void rebuild(AiService aiService) { + this.documentSplitter = DocumentSplitters .recursive(aiService.getPreferences().getDocumentSplitterChunkSize(), aiService.getPreferences().getDocumentSplitterOverlapSize()); - return EmbeddingStoreIngestor + this.ingestor = EmbeddingStoreIngestor .builder() .embeddingStore(aiService.getEmbeddingsManager().getEmbeddingsStore()) .embeddingModel(aiService.getEmbeddingModel().getEmbeddingModel()) @@ -56,7 +65,7 @@ private EmbeddingStoreIngestor rebuild(AiService aiService) { } private void setupListeningToPreferencesChanges() { - aiService.getPreferences().onEmbeddingsParametersChange(() -> ingestor = rebuild(aiService)); + aiService.getPreferences().onEmbeddingsParametersChange(() -> rebuild(aiService)); } /** @@ -120,11 +129,17 @@ private void ingestPDFFile(Path path, Metadata metadata) { } } - private void ingestString(String string, Metadata metadata) { + private void ingestString(String string, Metadata metadata) throws InterruptedException { 
ingestDocument(new Document(string, metadata)); } private void ingestDocument(Document document) { - ingestor.ingest(document); + for (TextSegment documentPart : documentSplitter.split(document)) { + if (shutdownProperty.get()) { + return; + } + + ingestor.ingest(new Document(documentPart.text(), document.metadata())); + } } } diff --git a/src/main/java/org/jabref/logic/ai/embeddings/EmbeddingsGenerationTask.java b/src/main/java/org/jabref/logic/ai/embeddings/EmbeddingsGenerationTask.java index c33246ef90d..c6c3e4ad411 100644 --- a/src/main/java/org/jabref/logic/ai/embeddings/EmbeddingsGenerationTask.java +++ b/src/main/java/org/jabref/logic/ai/embeddings/EmbeddingsGenerationTask.java @@ -7,6 +7,9 @@ import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; +import javafx.beans.property.BooleanProperty; +import javafx.beans.property.SimpleBooleanProperty; + import org.jabref.gui.util.BackgroundTask; import org.jabref.gui.util.TaskExecutor; import org.jabref.gui.util.UiTaskExecutor; @@ -40,9 +43,10 @@ public class EmbeddingsGenerationTask extends BackgroundTask { private final List linkedFileQueue = Collections.synchronizedList(new ArrayList<>()); private int numOfProcessedFiles = 0; - private final Object lock = new Object(); - private AtomicBoolean isRunning = new AtomicBoolean(false); - private AtomicBoolean isBlockingNewTasks = new AtomicBoolean(false); + private final AtomicBoolean isRunning = new AtomicBoolean(false); + private final AtomicBoolean isBlockingNewTasks = new AtomicBoolean(false); + + private final BooleanProperty shutdownProperty = new SimpleBooleanProperty(false); public EmbeddingsGenerationTask(BibDatabaseContext databaseContext, FilePreferences filePreferences, AiService aiService, TaskExecutor taskExecutor) { this.databaseContext = databaseContext; @@ -50,7 +54,7 @@ public EmbeddingsGenerationTask(BibDatabaseContext databaseContext, FilePreferen this.aiService = aiService; this.taskExecutor = taskExecutor; - this.aiIngestor = 
new AiIngestor(aiService); + this.aiIngestor = new AiIngestor(aiService, shutdownProperty); configure(); @@ -58,10 +62,12 @@ public EmbeddingsGenerationTask(BibDatabaseContext databaseContext, FilePreferen } private void configure() { - showToUser(true); willBeRecoveredAutomatically(true); - updateProgress(1, 1); titleProperty().set(Localization.lang("Embeddings generation")); + + this.onFailure(e -> { + throw new RuntimeException(e); + }); } private void setupListeningToPreferencesChanges() { @@ -86,7 +92,6 @@ public void addToStore(LinkedFile linkedFile) { if (!isRunning.get()) { this.executeWith(taskExecutor); - showToUser(false); } } } @@ -138,6 +143,7 @@ public void updateEmbeddings(BibDatabaseContext bibDatabaseContext) { @Override protected Void call() throws Exception { isRunning.set(true); + showToUser(true); updateProgress(); @@ -151,6 +157,7 @@ protected Void call() throws Exception { } isRunning.set(false); + showToUser(false); return null; } @@ -198,6 +205,6 @@ public void updateDatabaseName(String name) { public void shutdown() { linkedFileQueue.clear(); - // TODO: Stop the AiIngestor. + shutdownProperty.set(true); } } diff --git a/src/main/java/org/jabref/logic/ai/embeddings/MVStoreEmbeddingStore.java b/src/main/java/org/jabref/logic/ai/embeddings/MVStoreEmbeddingStore.java index f37c5757faa..1993801a4b5 100644 --- a/src/main/java/org/jabref/logic/ai/embeddings/MVStoreEmbeddingStore.java +++ b/src/main/java/org/jabref/logic/ai/embeddings/MVStoreEmbeddingStore.java @@ -25,6 +25,7 @@ import dev.langchain4j.store.embedding.filter.comparison.IsEqualTo; import dev.langchain4j.store.embedding.filter.comparison.IsIn; import jakarta.annotation.Nullable; +import org.h2.mvstore.MVMap; import org.h2.mvstore.MVStore; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -33,6 +34,10 @@ /** * A custom implementation of langchain4j's {@link EmbeddingStore} that uses a {@link MVStore} as an embedded database. + *

+ * Every embedding has 3 fields: a float array (the embedding itself), the file it was generated from, and the embedded + * string (the content). Each of those fields is stored in a separate {@link MVMap}. + * To connect the values of those fields we use an id, which is a random {@link UUID}. */ public class MVStoreEmbeddingStore implements EmbeddingStore { public static final String LINKED_FILE_METADATA_KEY = "linkedFile";