add SpaCy processor for NLP text analysis
- Introduced SpaCyProcessor to handle various file formats (PDF, DOCX, TXT, CSV)
- Supports recursive text splitting for chunked processing
- Applies spaCy NLP pipeline for tokenization and entity recognition on file content
1 parent 063bbd3 · commit 32a82bd
Showing 1 changed file with 105 additions and 0 deletions.
core/quivr_core/processor/implementations/spaCy_processor.py (105 additions, 0 deletions)
@@ -0,0 +1,105 @@
import logging

import aiofiles
import pandas as pd
import spacy
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter

from quivr_core.files.file import QuivrFile
from quivr_core.processor.processor_base import ProcessorBase
from quivr_core.processor.registry import FileExtension
from quivr_core.processor.splitter import SplitterConfig

logger = logging.getLogger("quivr_core")


class SpaCyProcessor(ProcessorBase):
    """
    SpaCyProcessor for handling various text file types with spaCy NLP.
    It extracts and processes text content using spaCy's NLP pipeline.

    ## Installation
    ```bash
    pip install spacy pandas
    python -m spacy download en_core_web_sm
    ```
    """

    supported_extensions = [
        FileExtension.pdf,
        FileExtension.docx,
        FileExtension.txt,
        FileExtension.csv,
    ]

    def __init__(
        self,
        splitter: TextSplitter | None = None,
        splitter_config: SplitterConfig = SplitterConfig(),
        spacy_model: str = "en_core_web_sm",
    ) -> None:
        self.nlp = spacy.load(spacy_model)
        self.splitter_config = splitter_config

        if splitter:
            self.text_splitter = splitter
        else:
            self.text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                chunk_size=splitter_config.chunk_size,
                chunk_overlap=splitter_config.chunk_overlap,
            )

    @property
    def processor_metadata(self):
        return {
            "processor_cls": "SpaCyProcessor",
            "chunk_overlap": self.splitter_config.chunk_overlap,
        }

    async def process_file_inner(self, file: QuivrFile) -> list[Document]:
        # Extract text based on file type
        if file.extension == FileExtension.pdf:
            text = await self.extract_text_from_pdf(file)
        elif file.extension == FileExtension.docx:
            text = await self.extract_text_from_docx(file)
        elif file.extension == FileExtension.txt:
            text = await self.extract_text_from_txt(file)
        elif file.extension == FileExtension.csv:
            text = await self.extract_text_from_csv(file)
        else:
            raise ValueError(f"Unsupported file type: {file.extension}")

        # Split the extracted text into chunks before NLP processing
        doc = Document(page_content=text)
        processed_docs = self.text_splitter.split_documents([doc])

        # Run the spaCy pipeline once per chunk, reusing the resulting Doc
        # for both the token-count metadata and the normalized text
        for doc in processed_docs:
            nlp_doc = self.nlp(doc.page_content)
            doc.metadata = {"chunk_size": len(nlp_doc)}
            doc.page_content = nlp_doc.text

        return processed_docs

    async def extract_text_from_pdf(self, file: QuivrFile) -> str:
        # Placeholder: PDF text extraction not yet implemented
        async with file.open() as f:
            return "Extracted PDF text"

    async def extract_text_from_docx(self, file: QuivrFile) -> str:
        # Placeholder: DOCX text extraction not yet implemented
        async with file.open() as f:
            return "Extracted DOCX text"

    async def extract_text_from_txt(self, file: QuivrFile) -> str:
        async with aiofiles.open(file.path, mode="r") as f:
            content = await f.read()
            return content

    async def extract_text_from_csv(self, file: QuivrFile) -> str:
        # Flatten every cell into a single space-separated string
        df = pd.read_csv(file.path)
        return " ".join(df.astype(str).values.flatten())
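For reference, a minimal standalone sketch of the per-chunk flow `process_file_inner` implements (split first, then run spaCy once per chunk). The sample text and chunk sizes are illustrative only; it assumes `en_core_web_sm` has been downloaded as shown in the docstring, and that `tiktoken` is installed, since `from_tiktoken_encoder` depends on it:

```python
import spacy
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Assumes: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=100,  # illustrative values, not the project defaults
    chunk_overlap=20,
)

text = "Apple is looking at buying a U.K. startup for $1 billion. " * 20
for chunk in splitter.split_text(text):
    doc = nlp(chunk)  # one pass gives both tokenization and entity recognition
    print(len(doc), [(ent.text, ent.label_) for ent in doc.ents][:3])
```

Calling `self.nlp` once per chunk and reusing the resulting `Doc` for both the token-count metadata and the text is the same pattern as the loop in `process_file_inner`.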
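The PDF and DOCX extractors are still stubs. One hedged sketch of how they might be filled in, assuming the project is willing to add `pypdf` and `python-docx` as dependencies (neither ships with this commit); the functions take a plain path and would be adapted to the `QuivrFile`-based method signatures above, e.g. called with `file.path` as the TXT and CSV extractors do:

```python
import asyncio

from pypdf import PdfReader
from docx import Document as DocxDocument  # provided by python-docx

async def extract_text_from_pdf(path: str) -> str:
    # pypdf reads synchronously, so offload to a worker thread
    def _read() -> str:
        reader = PdfReader(path)
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    return await asyncio.to_thread(_read)

async def extract_text_from_docx(path: str) -> str:
    doc = DocxDocument(path)
    return "\n".join(paragraph.text for paragraph in doc.paragraphs)
```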