From 830e97ff5be4e18f435d49f496d0e30de23af2ee Mon Sep 17 00:00:00 2001 From: InAnYan Date: Wed, 4 Sep 2024 12:52:35 +0300 Subject: [PATCH] Might be fix for https://github.com/InAnYan/jabref/issues/178 --- .../ai/ingestion/FileEmbeddingsManager.java | 10 +++---- .../logic/ai/ingestion/FileToDocument.java | 26 ++++++++++++++----- .../GenerateEmbeddingsForSeveralTask.java | 23 +++++++++++----- .../ai/ingestion/GenerateEmbeddingsTask.java | 10 +++++-- .../logic/ai/ingestion/IngestionService.java | 12 ++++++--- .../ai/summarization/GenerateSummaryTask.java | 2 +- .../pdf/InterruptablePDFTextStripper.java | 26 +++++++++++++++++++ 7 files changed, 84 insertions(+), 25 deletions(-) create mode 100644 src/main/java/org/jabref/logic/pdf/InterruptablePDFTextStripper.java diff --git a/src/main/java/org/jabref/logic/ai/ingestion/FileEmbeddingsManager.java b/src/main/java/org/jabref/logic/ai/ingestion/FileEmbeddingsManager.java index 9814cf0ed7b..c366874885f 100644 --- a/src/main/java/org/jabref/logic/ai/ingestion/FileEmbeddingsManager.java +++ b/src/main/java/org/jabref/logic/ai/ingestion/FileEmbeddingsManager.java @@ -31,20 +31,20 @@ public class FileEmbeddingsManager { public static final String LINK_METADATA_KEY = "link"; private final AiPreferences aiPreferences; - private final ReadOnlyBooleanProperty shutdownProperty; + private final ReadOnlyBooleanProperty shutdownSignal; private final EmbeddingStore embeddingStore; private final FullyIngestedDocumentsTracker fullyIngestedDocumentsTracker; private final LowLevelIngestor lowLevelIngestor; public FileEmbeddingsManager(AiPreferences aiPreferences, - ReadOnlyBooleanProperty shutdownProperty, + ReadOnlyBooleanProperty shutdownSignal, EmbeddingModel embeddingModel, EmbeddingStore embeddingStore, FullyIngestedDocumentsTracker fullyIngestedDocumentsTracker ) { this.aiPreferences = aiPreferences; - this.shutdownProperty = shutdownProperty; + this.shutdownSignal = shutdownSignal; this.embeddingStore = embeddingStore; this.fullyIngestedDocumentsTracker = fullyIngestedDocumentsTracker; this.lowLevelIngestor = new LowLevelIngestor(aiPreferences, embeddingStore, embeddingModel); @@ -58,9 +58,9 @@ private void setupListeningToPreferencesChanges() { public void addDocument(String link, Document document, long modificationTimeInSeconds, IntegerProperty workDone, IntegerProperty workMax) throws InterruptedException { document.metadata().put(LINK_METADATA_KEY, link); - lowLevelIngestor.ingestDocument(document, shutdownProperty, workDone, workMax); + lowLevelIngestor.ingestDocument(document, shutdownSignal, workDone, workMax); - if (!shutdownProperty.get()) { + if (!shutdownSignal.get()) { fullyIngestedDocumentsTracker.markDocumentAsFullyIngested(link, modificationTimeInSeconds); } } diff --git a/src/main/java/org/jabref/logic/ai/ingestion/FileToDocument.java b/src/main/java/org/jabref/logic/ai/ingestion/FileToDocument.java index 1195694ff45..0d8cb119906 100644 --- a/src/main/java/org/jabref/logic/ai/ingestion/FileToDocument.java +++ b/src/main/java/org/jabref/logic/ai/ingestion/FileToDocument.java @@ -4,47 +4,59 @@ import java.nio.file.Path; import java.util.Optional; +import javafx.beans.property.ReadOnlyBooleanProperty; + +import org.jabref.logic.pdf.InterruptablePDFTextStripper; import org.jabref.logic.util.io.FileUtil; import org.jabref.logic.xmp.XmpUtilReader; import dev.langchain4j.data.document.Document; import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.text.PDFTextStripper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class FileToDocument { private static final Logger LOGGER = LoggerFactory.getLogger(FileToDocument.class); - public static Optional fromFile(Path path) { + private final ReadOnlyBooleanProperty shutdownSignal; + + public FileToDocument(ReadOnlyBooleanProperty shutdownSignal) { + this.shutdownSignal = shutdownSignal; + } + + public Optional fromFile(Path path) { if (FileUtil.isPDFFile(path)) { - return FileToDocument.fromPdfFile(path); + return fromPdfFile(path); } else { LOGGER.info("Unsupported file type of file: {}. Currently, only PDF files are supported", path); return Optional.empty(); } } - private static Optional fromPdfFile(Path path) { + private Optional fromPdfFile(Path path) { // This method is private to ensure that the path is really pointing to PDF file (determined by extension). try (PDDocument document = new XmpUtilReader().loadWithAutomaticDecryption(path)) { int lastPage = document.getNumberOfPages(); StringWriter writer = new StringWriter(); - PDFTextStripper stripper = new PDFTextStripper(); + InterruptablePDFTextStripper stripper = new InterruptablePDFTextStripper(shutdownSignal); stripper.setStartPage(1); stripper.setEndPage(lastPage); stripper.writeText(document, writer); - return FileToDocument.fromString(writer.toString()); + if (shutdownSignal.get()) { + return Optional.empty(); + } + + return fromString(writer.toString()); } catch (Exception e) { LOGGER.error("An error occurred while reading the PDF file: {}", path, e); return Optional.empty(); } } - public static Optional fromString(String content) { + public Optional fromString(String content) { return Optional.of(new Document(content)); } } diff --git a/src/main/java/org/jabref/logic/ai/ingestion/GenerateEmbeddingsForSeveralTask.java b/src/main/java/org/jabref/logic/ai/ingestion/GenerateEmbeddingsForSeveralTask.java index 66cd04de426..43a0c813e7d 100644 --- a/src/main/java/org/jabref/logic/ai/ingestion/GenerateEmbeddingsForSeveralTask.java +++ b/src/main/java/org/jabref/logic/ai/ingestion/GenerateEmbeddingsForSeveralTask.java @@ -4,6 +4,7 @@ import java.util.List; import java.util.concurrent.Future; +import javafx.beans.property.ReadOnlyBooleanProperty; import javafx.beans.property.StringProperty; import javafx.util.Pair; @@ -34,6 +35,7 @@ public class GenerateEmbeddingsForSeveralTask extends BackgroundTask { private final BibDatabaseContext bibDatabaseContext; private final FilePreferences filePreferences; private final TaskExecutor taskExecutor; + private final ReadOnlyBooleanProperty shutdownSignal; private final ProgressCounter progressCounter = new ProgressCounter(); @@ -45,7 +47,8 @@ public GenerateEmbeddingsForSeveralTask( FileEmbeddingsManager fileEmbeddingsManager, BibDatabaseContext bibDatabaseContext, FilePreferences filePreferences, - TaskExecutor taskExecutor + TaskExecutor taskExecutor, + ReadOnlyBooleanProperty shutdownSignal ) { this.name = name; this.linkedFiles = linkedFiles; @@ -53,6 +56,7 @@ public GenerateEmbeddingsForSeveralTask( this.bibDatabaseContext = bibDatabaseContext; this.filePreferences = filePreferences; this.taskExecutor = taskExecutor; + this.shutdownSignal = shutdownSignal; configure(name); } @@ -76,11 +80,18 @@ protected Void call() throws Exception { .stream() .map(processingInfo -> { processingInfo.setState(ProcessingState.PROCESSING); - return new Pair<>(new GenerateEmbeddingsTask(processingInfo.getObject(), fileEmbeddingsManager, bibDatabaseContext, filePreferences) - .onSuccess(v -> processingInfo.setState(ProcessingState.SUCCESS)) - .onFailure(processingInfo::setException) - .onFinished(() -> progressCounter.increaseWorkDone(1)) - .executeWith(taskExecutor), + return new Pair<>( + new GenerateEmbeddingsTask( + processingInfo.getObject(), + fileEmbeddingsManager, + bibDatabaseContext, + filePreferences, + shutdownSignal + ) + .onSuccess(v -> processingInfo.setState(ProcessingState.SUCCESS)) + .onFailure(processingInfo::setException) + .onFinished(() -> progressCounter.increaseWorkDone(1)) + .executeWith(taskExecutor), processingInfo.getObject().getLink()); }) .forEach(futures::add); diff --git a/src/main/java/org/jabref/logic/ai/ingestion/GenerateEmbeddingsTask.java b/src/main/java/org/jabref/logic/ai/ingestion/GenerateEmbeddingsTask.java index 33c603031b1..e0e1561f267 100644 --- a/src/main/java/org/jabref/logic/ai/ingestion/GenerateEmbeddingsTask.java +++ b/src/main/java/org/jabref/logic/ai/ingestion/GenerateEmbeddingsTask.java @@ -7,6 +7,8 @@ import java.util.Optional; import java.util.concurrent.TimeUnit; +import javafx.beans.property.ReadOnlyBooleanProperty; + import org.jabref.gui.util.BackgroundTask; import org.jabref.logic.l10n.Localization; import org.jabref.logic.util.ProgressCounter; @@ -30,17 +32,21 @@ public class GenerateEmbeddingsTask extends BackgroundTask { private final FileEmbeddingsManager fileEmbeddingsManager; private final BibDatabaseContext bibDatabaseContext; private final FilePreferences filePreferences; + private final ReadOnlyBooleanProperty shutdownSignal; private final ProgressCounter progressCounter = new ProgressCounter(); public GenerateEmbeddingsTask(LinkedFile linkedFile, FileEmbeddingsManager fileEmbeddingsManager, BibDatabaseContext bibDatabaseContext, - FilePreferences filePreferences) { + FilePreferences filePreferences, + ReadOnlyBooleanProperty shutdownSignal + ) { this.linkedFile = linkedFile; this.fileEmbeddingsManager = fileEmbeddingsManager; this.bibDatabaseContext = bibDatabaseContext; this.filePreferences = filePreferences; + this.shutdownSignal = shutdownSignal; configure(linkedFile); } @@ -109,7 +115,7 @@ private void ingestLinkedFile(LinkedFile linkedFile) throws InterruptedException return; } - Optional document = FileToDocument.fromFile(path.get()); + Optional document = new FileToDocument(shutdownSignal).fromFile(path.get()); if (document.isPresent()) { fileEmbeddingsManager.addDocument(linkedFile.getLink(), document.get(), modTime.orElse(0L), progressCounter.workDoneProperty(), progressCounter.workMaxProperty()); LOGGER.debug("Embeddings for file \"{}\" were generated successfully", linkedFile.getLink()); diff --git a/src/main/java/org/jabref/logic/ai/ingestion/IngestionService.java b/src/main/java/org/jabref/logic/ai/ingestion/IngestionService.java index aebb21d3bda..9e692a04b36 100644 --- a/src/main/java/org/jabref/logic/ai/ingestion/IngestionService.java +++ b/src/main/java/org/jabref/logic/ai/ingestion/IngestionService.java @@ -34,8 +34,10 @@ public class IngestionService { private final FileEmbeddingsManager fileEmbeddingsManager; + private final ReadOnlyBooleanProperty shutdownSignal; + public IngestionService(AiPreferences aiPreferences, - ReadOnlyBooleanProperty shutdownProperty, + ReadOnlyBooleanProperty shutdownSignal, EmbeddingModel embeddingModel, EmbeddingStore embeddingStore, FullyIngestedDocumentsTracker fullyIngestedDocumentsTracker, @@ -47,11 +49,13 @@ public IngestionService(AiPreferences aiPreferences, this.fileEmbeddingsManager = new FileEmbeddingsManager( aiPreferences, - shutdownProperty, + shutdownSignal, embeddingModel, embeddingStore, fullyIngestedDocumentsTracker ); + + this.shutdownSignal = shutdownSignal; } /** @@ -96,14 +100,14 @@ public List> ingest(StringProperty name, List processingInfo) { - new GenerateEmbeddingsTask(linkedFile, fileEmbeddingsManager, bibDatabaseContext, filePreferences) + new GenerateEmbeddingsTask(linkedFile, fileEmbeddingsManager, bibDatabaseContext, filePreferences, shutdownSignal) .onSuccess(v -> processingInfo.setState(ProcessingState.SUCCESS)) .onFailure(processingInfo::setException) .executeWith(taskExecutor); } private void startEmbeddingsGenerationTask(StringProperty name, List> linkedFiles, BibDatabaseContext bibDatabaseContext) { - new GenerateEmbeddingsForSeveralTask(name, linkedFiles, fileEmbeddingsManager, bibDatabaseContext, filePreferences, taskExecutor) + new GenerateEmbeddingsForSeveralTask(name, linkedFiles, fileEmbeddingsManager, bibDatabaseContext, filePreferences, taskExecutor, shutdownSignal) .executeWith(taskExecutor); } diff --git a/src/main/java/org/jabref/logic/ai/summarization/GenerateSummaryTask.java b/src/main/java/org/jabref/logic/ai/summarization/GenerateSummaryTask.java index f2bd66b639a..1dfb447ecc5 100644 --- a/src/main/java/org/jabref/logic/ai/summarization/GenerateSummaryTask.java +++ b/src/main/java/org/jabref/logic/ai/summarization/GenerateSummaryTask.java @@ -170,7 +170,7 @@ private Optional generateSummary(LinkedFile linkedFile) throws Interrupt return Optional.empty(); } - Optional document = FileToDocument.fromFile(path.get()); + Optional document = new FileToDocument(shutdownSignal).fromFile(path.get()); if (document.isEmpty()) { LOGGER.warn("Could not extract text from a linked file \"{}\" of entry {}. It will be skipped when generating a summary.", linkedFile.getLink(), citationKey); diff --git a/src/main/java/org/jabref/logic/pdf/InterruptablePDFTextStripper.java b/src/main/java/org/jabref/logic/pdf/InterruptablePDFTextStripper.java new file mode 100644 index 00000000000..ec136da8959 --- /dev/null +++ b/src/main/java/org/jabref/logic/pdf/InterruptablePDFTextStripper.java @@ -0,0 +1,26 @@ +package org.jabref.logic.pdf; + +import java.io.IOException; + +import javafx.beans.property.ReadOnlyBooleanProperty; + +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.text.PDFTextStripper; + +public class InterruptablePDFTextStripper extends PDFTextStripper { + private final ReadOnlyBooleanProperty shutdownSignal; + + public InterruptablePDFTextStripper(ReadOnlyBooleanProperty shutdownSignal) { + super(); + this.shutdownSignal = shutdownSignal; + } + + @Override + public void processPage(PDPage page) throws IOException { + if (shutdownSignal.get()) { + return; + } + + super.processPage(page); + } +}