Skip to content

Commit

Permalink
Might be a fix for #178
Browse files Browse the repository at this point in the history
  • Loading branch information
InAnYan committed Sep 4, 2024
1 parent c007615 commit 830e97f
Show file tree
Hide file tree
Showing 7 changed files with 84 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,20 @@ public class FileEmbeddingsManager {
public static final String LINK_METADATA_KEY = "link";

private final AiPreferences aiPreferences;
private final ReadOnlyBooleanProperty shutdownProperty;
private final ReadOnlyBooleanProperty shutdownSignal;

private final EmbeddingStore<TextSegment> embeddingStore;
private final FullyIngestedDocumentsTracker fullyIngestedDocumentsTracker;
private final LowLevelIngestor lowLevelIngestor;

public FileEmbeddingsManager(AiPreferences aiPreferences,
ReadOnlyBooleanProperty shutdownProperty,
ReadOnlyBooleanProperty shutdownSignal,
EmbeddingModel embeddingModel,
EmbeddingStore<TextSegment> embeddingStore,
FullyIngestedDocumentsTracker fullyIngestedDocumentsTracker
) {
this.aiPreferences = aiPreferences;
this.shutdownProperty = shutdownProperty;
this.shutdownSignal = shutdownSignal;
this.embeddingStore = embeddingStore;
this.fullyIngestedDocumentsTracker = fullyIngestedDocumentsTracker;
this.lowLevelIngestor = new LowLevelIngestor(aiPreferences, embeddingStore, embeddingModel);
Expand All @@ -58,9 +58,9 @@ private void setupListeningToPreferencesChanges() {

public void addDocument(String link, Document document, long modificationTimeInSeconds, IntegerProperty workDone, IntegerProperty workMax) throws InterruptedException {
document.metadata().put(LINK_METADATA_KEY, link);
lowLevelIngestor.ingestDocument(document, shutdownProperty, workDone, workMax);
lowLevelIngestor.ingestDocument(document, shutdownSignal, workDone, workMax);

if (!shutdownProperty.get()) {
if (!shutdownSignal.get()) {
fullyIngestedDocumentsTracker.markDocumentAsFullyIngested(link, modificationTimeInSeconds);
}
}
Expand Down
26 changes: 19 additions & 7 deletions src/main/java/org/jabref/logic/ai/ingestion/FileToDocument.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,47 +4,59 @@
import java.nio.file.Path;
import java.util.Optional;

import javafx.beans.property.ReadOnlyBooleanProperty;

import org.jabref.logic.pdf.InterruptablePDFTextStripper;
import org.jabref.logic.util.io.FileUtil;
import org.jabref.logic.xmp.XmpUtilReader;

import dev.langchain4j.data.document.Document;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FileToDocument {
private static final Logger LOGGER = LoggerFactory.getLogger(FileToDocument.class);

public static Optional<Document> fromFile(Path path) {
private final ReadOnlyBooleanProperty shutdownSignal;

public FileToDocument(ReadOnlyBooleanProperty shutdownSignal) {
this.shutdownSignal = shutdownSignal;
}

public Optional<Document> fromFile(Path path) {
if (FileUtil.isPDFFile(path)) {
return FileToDocument.fromPdfFile(path);
return fromPdfFile(path);
} else {
LOGGER.info("Unsupported file type of file: {}. Currently, only PDF files are supported", path);
return Optional.empty();
}
}

private static Optional<Document> fromPdfFile(Path path) {
private Optional<Document> fromPdfFile(Path path) {
// This method is private to ensure that the path is really pointing to PDF file (determined by extension).

try (PDDocument document = new XmpUtilReader().loadWithAutomaticDecryption(path)) {
int lastPage = document.getNumberOfPages();
StringWriter writer = new StringWriter();

PDFTextStripper stripper = new PDFTextStripper();
InterruptablePDFTextStripper stripper = new InterruptablePDFTextStripper(shutdownSignal);
stripper.setStartPage(1);
stripper.setEndPage(lastPage);
stripper.writeText(document, writer);

return FileToDocument.fromString(writer.toString());
if (shutdownSignal.get()) {
return Optional.empty();
}

return fromString(writer.toString());
} catch (Exception e) {
LOGGER.error("An error occurred while reading the PDF file: {}", path, e);
return Optional.empty();
}
}

public static Optional<Document> fromString(String content) {
public Optional<Document> fromString(String content) {
return Optional.of(new Document(content));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import java.util.List;
import java.util.concurrent.Future;

import javafx.beans.property.ReadOnlyBooleanProperty;
import javafx.beans.property.StringProperty;
import javafx.util.Pair;

Expand Down Expand Up @@ -34,6 +35,7 @@ public class GenerateEmbeddingsForSeveralTask extends BackgroundTask<Void> {
private final BibDatabaseContext bibDatabaseContext;
private final FilePreferences filePreferences;
private final TaskExecutor taskExecutor;
private final ReadOnlyBooleanProperty shutdownSignal;

private final ProgressCounter progressCounter = new ProgressCounter();

Expand All @@ -45,14 +47,16 @@ public GenerateEmbeddingsForSeveralTask(
FileEmbeddingsManager fileEmbeddingsManager,
BibDatabaseContext bibDatabaseContext,
FilePreferences filePreferences,
TaskExecutor taskExecutor
TaskExecutor taskExecutor,
ReadOnlyBooleanProperty shutdownSignal
) {
this.name = name;
this.linkedFiles = linkedFiles;
this.fileEmbeddingsManager = fileEmbeddingsManager;
this.bibDatabaseContext = bibDatabaseContext;
this.filePreferences = filePreferences;
this.taskExecutor = taskExecutor;
this.shutdownSignal = shutdownSignal;

configure(name);
}
Expand All @@ -76,11 +80,18 @@ protected Void call() throws Exception {
.stream()
.map(processingInfo -> {
processingInfo.setState(ProcessingState.PROCESSING);
return new Pair<>(new GenerateEmbeddingsTask(processingInfo.getObject(), fileEmbeddingsManager, bibDatabaseContext, filePreferences)
.onSuccess(v -> processingInfo.setState(ProcessingState.SUCCESS))
.onFailure(processingInfo::setException)
.onFinished(() -> progressCounter.increaseWorkDone(1))
.executeWith(taskExecutor),
return new Pair<>(
new GenerateEmbeddingsTask(
processingInfo.getObject(),
fileEmbeddingsManager,
bibDatabaseContext,
filePreferences,
shutdownSignal
)
.onSuccess(v -> processingInfo.setState(ProcessingState.SUCCESS))
.onFailure(processingInfo::setException)
.onFinished(() -> progressCounter.increaseWorkDone(1))
.executeWith(taskExecutor),
processingInfo.getObject().getLink());
})
.forEach(futures::add);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import java.util.Optional;
import java.util.concurrent.TimeUnit;

import javafx.beans.property.ReadOnlyBooleanProperty;

import org.jabref.gui.util.BackgroundTask;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.util.ProgressCounter;
Expand All @@ -30,17 +32,21 @@ public class GenerateEmbeddingsTask extends BackgroundTask<Void> {
private final FileEmbeddingsManager fileEmbeddingsManager;
private final BibDatabaseContext bibDatabaseContext;
private final FilePreferences filePreferences;
private final ReadOnlyBooleanProperty shutdownSignal;

private final ProgressCounter progressCounter = new ProgressCounter();

public GenerateEmbeddingsTask(LinkedFile linkedFile,
FileEmbeddingsManager fileEmbeddingsManager,
BibDatabaseContext bibDatabaseContext,
FilePreferences filePreferences) {
FilePreferences filePreferences,
ReadOnlyBooleanProperty shutdownSignal
) {
this.linkedFile = linkedFile;
this.fileEmbeddingsManager = fileEmbeddingsManager;
this.bibDatabaseContext = bibDatabaseContext;
this.filePreferences = filePreferences;
this.shutdownSignal = shutdownSignal;

configure(linkedFile);
}
Expand Down Expand Up @@ -109,7 +115,7 @@ private void ingestLinkedFile(LinkedFile linkedFile) throws InterruptedException
return;
}

Optional<Document> document = FileToDocument.fromFile(path.get());
Optional<Document> document = new FileToDocument(shutdownSignal).fromFile(path.get());
if (document.isPresent()) {
fileEmbeddingsManager.addDocument(linkedFile.getLink(), document.get(), modTime.orElse(0L), progressCounter.workDoneProperty(), progressCounter.workMaxProperty());
LOGGER.debug("Embeddings for file \"{}\" were generated successfully", linkedFile.getLink());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@ public class IngestionService {

private final FileEmbeddingsManager fileEmbeddingsManager;

private final ReadOnlyBooleanProperty shutdownSignal;

public IngestionService(AiPreferences aiPreferences,
ReadOnlyBooleanProperty shutdownProperty,
ReadOnlyBooleanProperty shutdownSignal,
EmbeddingModel embeddingModel,
EmbeddingStore<TextSegment> embeddingStore,
FullyIngestedDocumentsTracker fullyIngestedDocumentsTracker,
Expand All @@ -47,11 +49,13 @@ public IngestionService(AiPreferences aiPreferences,

this.fileEmbeddingsManager = new FileEmbeddingsManager(
aiPreferences,
shutdownProperty,
shutdownSignal,
embeddingModel,
embeddingStore,
fullyIngestedDocumentsTracker
);

this.shutdownSignal = shutdownSignal;
}

/**
Expand Down Expand Up @@ -96,14 +100,14 @@ public List<ProcessingInfo<LinkedFile, Void>> ingest(StringProperty name, List<L
}

private void startEmbeddingsGenerationTask(LinkedFile linkedFile, BibDatabaseContext bibDatabaseContext, ProcessingInfo<LinkedFile, Void> processingInfo) {
new GenerateEmbeddingsTask(linkedFile, fileEmbeddingsManager, bibDatabaseContext, filePreferences)
new GenerateEmbeddingsTask(linkedFile, fileEmbeddingsManager, bibDatabaseContext, filePreferences, shutdownSignal)
.onSuccess(v -> processingInfo.setState(ProcessingState.SUCCESS))
.onFailure(processingInfo::setException)
.executeWith(taskExecutor);
}

private void startEmbeddingsGenerationTask(StringProperty name, List<ProcessingInfo<LinkedFile, Void>> linkedFiles, BibDatabaseContext bibDatabaseContext) {
new GenerateEmbeddingsForSeveralTask(name, linkedFiles, fileEmbeddingsManager, bibDatabaseContext, filePreferences, taskExecutor)
new GenerateEmbeddingsForSeveralTask(name, linkedFiles, fileEmbeddingsManager, bibDatabaseContext, filePreferences, taskExecutor, shutdownSignal)
.executeWith(taskExecutor);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ private Optional<String> generateSummary(LinkedFile linkedFile) throws Interrupt
return Optional.empty();
}

Optional<Document> document = FileToDocument.fromFile(path.get());
Optional<Document> document = new FileToDocument(shutdownSignal).fromFile(path.get());

if (document.isEmpty()) {
LOGGER.warn("Could not extract text from a linked file \"{}\" of entry {}. It will be skipped when generating a summary.", linkedFile.getLink(), citationKey);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package org.jabref.logic.pdf;

import java.io.IOException;

import javafx.beans.property.ReadOnlyBooleanProperty;

import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * A {@link PDFTextStripper} that checks a shutdown signal before handling each page.
 * <p>
 * While the signal reads {@code false}, pages are processed by the superclass as usual.
 * Once it reads {@code true}, {@link #processPage(PDPage)} returns immediately without
 * delegating to the superclass, so the page is skipped.
 */
public class InterruptablePDFTextStripper extends PDFTextStripper {
    /** Property consulted on every page; {@code true} means page processing is skipped. */
    private final ReadOnlyBooleanProperty shutdownSignal;

    /**
     * Creates a stripper bound to the given shutdown signal.
     *
     * @param shutdownSignal property read at the start of every {@link #processPage(PDPage)} call
     */
    public InterruptablePDFTextStripper(ReadOnlyBooleanProperty shutdownSignal) {
        this.shutdownSignal = shutdownSignal;
    }

    /**
     * Processes the page through the superclass unless the shutdown signal is set.
     *
     * @param page the page to process
     * @throws IOException if the superclass implementation fails while processing the page
     */
    @Override
    public void processPage(PDPage page) throws IOException {
        boolean shutdownRequested = shutdownSignal.get();
        if (!shutdownRequested) {
            super.processPage(page);
        }
    }
}

0 comments on commit 830e97f

Please sign in to comment.